diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -74,6 +74,7 @@ DS_READ, DS_WRITE, S_BUFFER_LOAD_IMM, + S_LOAD_IMM, BUFFER_LOAD, BUFFER_STORE, MIMG, @@ -325,6 +326,7 @@ switch (Opc) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_IMM: case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_STORE_DWORD: @@ -333,6 +335,7 @@ case AMDGPU::FLAT_STORE_DWORD: return 1; case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX2: @@ -348,6 +351,7 @@ case AMDGPU::FLAT_STORE_DWORDX3: return 3; case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::GLOBAL_LOAD_DWORDX4: case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4: @@ -356,6 +360,7 @@ case AMDGPU::FLAT_STORE_DWORDX4: return 4; case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: return 8; case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; @@ -428,6 +433,11 @@ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return S_BUFFER_LOAD_IMM; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: + return S_LOAD_IMM; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: case AMDGPU::DS_READ_B64: @@ -499,6 +509,11 @@ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: + return AMDGPU::S_LOAD_DWORD_IMM; case AMDGPU::GLOBAL_LOAD_DWORD: case 
AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX3: @@ -595,6 +610,10 @@ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: Result.SBase = true; return Result; case AMDGPU::DS_READ_B32: @@ -661,6 +680,7 @@ : 4; break; case S_BUFFER_LOAD_IMM: + case S_LOAD_IMM: EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); break; default: @@ -981,6 +1001,7 @@ default: return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); case S_BUFFER_LOAD_IMM: + case S_LOAD_IMM: switch (Width) { default: return false; @@ -1623,6 +1644,17 @@ case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; } + case S_LOAD_IMM: + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 4: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 8: + return AMDGPU::S_LOAD_DWORDX8_IMM; + } case GLOBAL_LOAD: switch (Width) { default: @@ -1731,7 +1763,7 @@ const TargetRegisterClass * SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired) { - if (CI.InstClass == S_BUFFER_LOAD_IMM) { + if (CI.InstClass == S_BUFFER_LOAD_IMM || CI.InstClass == S_LOAD_IMM) { switch (CI.Width + Paired.Width) { default: return nullptr; @@ -2300,6 +2332,7 @@ NewMI = mergeWrite2Pair(CI, Paired, Where->I); break; case S_BUFFER_LOAD_IMM: + case S_LOAD_IMM: NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I); OptimizeListAgain |= CI.Width + Paired.Width < 8; break; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -452,25 +452,24 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; 
SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -527,18 +526,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; SI-NEXT: s_waitcnt 
vmcnt(2) @@ -556,15 +556,13 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -643,28 +641,27 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -691,27 +688,26 @@ define amdgpu_kernel void 
@i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -739,25 +735,24 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -782,18 +777,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1 ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -811,15 +807,13 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, 
s3 @@ -865,26 +859,25 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -909,27 +902,26 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 8, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt 
lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -955,27 +947,26 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v0, v0, 16, 8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1001,26 +992,25 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; 
SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -63,11 +63,10 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll @@ -11,18 +11,19 @@ ; GFX8V3-LABEL: addrspacecast: ; GFX8V3: ; %bb.0: ; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x44 -; GFX8V3-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V3-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V3-NEXT: s_mov_b32 s2, s0 +; GFX8V3-NEXT: s_mov_b32 s4, s0 +; GFX8V3-NEXT: s_mov_b32 s5, s3 ; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V3-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8V3-NEXT: s_mov_b32 s4, s1 +; GFX8V3-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX8V3-NEXT: s_mov_b32 s6, s1 +; GFX8V3-NEXT: s_mov_b32 s7, s2 ; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 -; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX8V3-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V3-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V3-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V3-NEXT: flat_store_dword v[0:1], v2 ; GFX8V3-NEXT: s_waitcnt vmcnt(0) ; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 @@ -35,18 +36,19 @@ ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x44 -; GFX8V4-NEXT: s_load_dword s5, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V4-NEXT: s_mov_b32 s2, s0 +; GFX8V4-NEXT: s_mov_b32 s4, s0 +; GFX8V4-NEXT: s_mov_b32 s5, s3 ; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V4-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8V4-NEXT: s_mov_b32 s4, s1 +; GFX8V4-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX8V4-NEXT: s_mov_b32 s6, s1 +; GFX8V4-NEXT: s_mov_b32 s7, s2 ; GFX8V4-NEXT: 
s_cmp_lg_u32 s1, -1 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 -; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX8V4-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0 +; GFX8V4-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V4-NEXT: flat_store_dword v[0:1], v2 ; GFX8V4-NEXT: s_waitcnt vmcnt(0) ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 @@ -59,18 +61,18 @@ ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xc8 -; GFX8V5-NEXT: s_load_dword s5, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v2, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) -; GFX8V5-NEXT: s_mov_b32 s2, s0 +; GFX8V5-NEXT: s_mov_b32 s4, s0 +; GFX8V5-NEXT: s_mov_b32 s5, s2 ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V5-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8V5-NEXT: s_mov_b32 s4, s1 +; GFX8V5-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 +; GFX8V5-NEXT: s_mov_b32 s2, s1 ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 -; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0 -; GFX8V5-NEXT: v_mov_b32_e32 v1, s3 +; GFX8V5-NEXT: v_mov_b32_e32 v0, s4 +; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0 +; GFX8V5-NEXT: v_mov_b32_e32 v1, s5 ; GFX8V5-NEXT: flat_store_dword v[0:1], v2 ; GFX8V5-NEXT: s_waitcnt vmcnt(0) ; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -635,9 +635,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; 
GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -646,10 +644,10 @@ ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: v_mov_b32_e32 v3, s7 +; GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-NEXT: v_mov_b32_e32 v3, s3 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -660,26 +658,24 @@ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s4 -; GFX1013-NEXT: v_mov_b32_e32 v1, s5 -; GFX1013-NEXT: v_mov_b32_e32 v2, s6 -; GFX1013-NEXT: v_mov_b32_e32 v3, s7 +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: 
v_mov_b32_e32 v2, s2 +; GFX1013-NEXT: v_mov_b32_e32 v3, s3 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -692,18 +688,15 @@ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s9, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 ; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 @@ -713,23 +706,24 @@ ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: 
v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: s_mov_b32 s7, 1.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 -; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: s_mov_b32 s2, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -756,126 +750,120 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: s_movk_i32 s9, 0x4600 ; GFX1030-NEXT: s_movk_i32 s8, 0x4700 ; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: v_mov_b32_e32 v3, s7 -; GFX1030-NEXT: s_movk_i32 s5, 0x4400 +; GFX1030-NEXT: v_mov_b32_e32 v0, s0 +; GFX1030-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030-NEXT: v_mov_b32_e32 v2, s2 +; GFX1030-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-NEXT: s_movk_i32 s1, 0x4400 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, 
vcc_lo, v2, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: s_movk_i32 s6, 0x4200 +; GFX1030-NEXT: s_movk_i32 s2, 0x4200 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] -; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX1030-NEXT: s_movk_i32 s7, 0x4800 -; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 -; GFX1030-NEXT: s_movk_i32 s4, 0x4500 -; GFX1030-NEXT: s_or_b32 s5, s6, s5 -; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX1030-NEXT: s_lshl_b32 s6, s6, 16 -; GFX1030-NEXT: s_lshl_b32 s7, s7, 16 -; GFX1030-NEXT: s_or_b32 s4, s4, s6 -; GFX1030-NEXT: s_or_b32 s6, s8, s7 +; GFX1030-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX1030-NEXT: s_movk_i32 s3, 0x4800 +; GFX1030-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX1030-NEXT: s_lshl_b32 s1, s1, 16 +; GFX1030-NEXT: s_movk_i32 s0, 0x4500 +; GFX1030-NEXT: s_or_b32 s1, s2, s1 +; GFX1030-NEXT: s_bfe_u32 s2, s9, 0x100000 +; GFX1030-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX1030-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX1030-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1030-NEXT: s_lshl_b32 s3, s3, 16 +; GFX1030-NEXT: s_or_b32 s0, s0, s2 +; GFX1030-NEXT: s_or_b32 s2, s8, s3 ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 -; GFX1030-NEXT: v_mov_b32_e32 v5, s5 -; GFX1030-NEXT: v_mov_b32_e32 v6, s4 -; GFX1030-NEXT: v_mov_b32_e32 v7, s6 +; GFX1030-NEXT: v_mov_b32_e32 v5, s1 +; GFX1030-NEXT: v_mov_b32_e32 v6, s0 +; GFX1030-NEXT: v_mov_b32_e32 v7, s2 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; 
%bb.0: -; GFX1013-NEXT: s_clause 0x1 -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; GFX1013-NEXT: s_movk_i32 s9, 0x4600 +; GFX1013-NEXT: s_movk_i32 s8, 0x4700 +; GFX1013-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX1013-NEXT: s_waitcnt lgkmcnt(0) +; GFX1013-NEXT: v_mov_b32_e32 v0, s0 +; GFX1013-NEXT: v_mov_b32_e32 v1, s1 +; GFX1013-NEXT: v_mov_b32_e32 v2, s2 +; GFX1013-NEXT: v_mov_b32_e32 v3, s3 ; GFX1013-NEXT: s_movk_i32 s1, 0x4400 +; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo +; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 +; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1013-NEXT: s_movk_i32 s2, 0x4200 +; GFX1013-NEXT: flat_load_dword v0, v[4:5] +; GFX1013-NEXT: flat_load_dword v1, v[2:3] ; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX1013-NEXT: s_movk_i32 s3, 0x4800 ; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 ; GFX1013-NEXT: s_movk_i32 s0, 0x4500 ; GFX1013-NEXT: s_or_b32 s1, s2, s1 +; GFX1013-NEXT: s_bfe_u32 s2, s9, 0x100000 ; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 -; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s4 -; GFX1013-NEXT: v_mov_b32_e32 v1, s5 -; GFX1013-NEXT: v_mov_b32_e32 v2, s6 -; GFX1013-NEXT: v_mov_b32_e32 v3, s7 -; GFX1013-NEXT: s_movk_i32 s5, 0x4600 -; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo -; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 -; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX1013-NEXT: s_movk_i32 s4, 0x4700 -; GFX1013-NEXT: flat_load_dword v0, v[4:5] -; GFX1013-NEXT: flat_load_dword v1, v[2:3] -; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000 -; GFX1013-NEXT: s_bfe_u32 
s4, s4, 0x100000 ; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 -; GFX1013-NEXT: v_mov_b32_e32 v2, 0 +; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 ; GFX1013-NEXT: s_or_b32 s0, s0, s2 -; GFX1013-NEXT: s_or_b32 s2, s4, s3 +; GFX1013-NEXT: s_or_b32 s2, s8, s3 +; GFX1013-NEXT: v_mov_b32_e32 v2, 0 ; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, s1 ; GFX1013-NEXT: v_mov_b32_e32 v6, s0 ; GFX1013-NEXT: v_mov_b32_e32 v7, s2 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s9, 0x42004600 ; GFX11-NEXT: s_mov_b32 s10, 0x44004700 ; GFX11-NEXT: s_mov_b32 s11, 0x45004800 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) 
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 -; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: s_mov_b32 s2, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 ; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[0:3] a16 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -44,14 +44,12 @@ define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_i32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 -; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: s_bfe_i32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -64,14 +62,12 @@ define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_arg_imm_arg: ; GFX6: ; %bb.0: -; 
GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 -; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: s_bfe_i32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -84,15 +80,13 @@ define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_i32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s4, s2, 63 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_bfe_i32 s3, 0x7b, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -105,10 +99,9 @@ define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { ; GFX6-LABEL: v_bfe_print_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80002 @@ -125,13 +118,11 @@ define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: 
bfe_i32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 -; GFX6-NEXT: s_bfe_i32 s3, s4, s3 +; GFX6-NEXT: s_bfe_i32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -161,10 +152,9 @@ define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -183,10 +173,9 @@ define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -205,10 +194,9 @@ define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 
s3, 31 @@ -227,10 +215,9 @@ define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1001f @@ -247,10 +234,9 @@ define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x1f0001 @@ -267,10 +253,9 @@ define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x180008 @@ -287,10 +272,9 @@ define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80018 @@ -307,10 +291,9 @@ define 
amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s3, s3, 31 @@ -328,10 +311,9 @@ define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_i32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s3, 31 @@ -666,10 +648,9 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_sext_in_reg_i24: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x180000 @@ -690,26 +671,25 @@ ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0 -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: 
s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_lo_u32 v1, -2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s2, s2, 0x100001 -; GFX6-NEXT: s_ashr_i32 s3, s2, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001 +; GFX6-NEXT: s_ashr_i32 s2, s0, 31 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_add_i32 s2, s2, s3 -; GFX6-NEXT: s_xor_b32 s2, s2, s3 +; GFX6-NEXT: s_add_i32 s0, s0, s2 +; GFX6-NEXT: s_xor_b32 s0, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1 @@ -717,8 +697,8 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s3, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %src = load i32, i32 addrspace(1)* %in, align 4 @@ -731,10 +711,9 @@ define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_0_width: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 @@ -751,10 +730,9 @@ define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 @@ -773,10 +751,9 @@ define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_8_bfe_16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 @@ -796,10 +773,9 @@ define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: bfe_16_bfe_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100000 @@ -819,14 +795,12 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX6-NEXT: s_sext_i32_i8 s3, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -842,14 +816,12 @@ define amdgpu_kernel 
void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_add_i32 s3, s2, s3 ; GFX6-NEXT: s_bfe_i32 s3, s3, 8 ; GFX6-NEXT: s_sext_i32_i8 s3, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -865,17 +837,17 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 @@ -889,17 +861,17 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { ; GFX6-LABEL: sextload_i8_to_i32_bfe_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; 
GFX6-NEXT: buffer_load_sbyte v0, off, s[4:7], 0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 8, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 @@ -913,10 +885,9 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10000 @@ -936,10 +907,9 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x20000 @@ -959,10 +929,9 @@ define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: 
s_bfe_i32 s3, s3, 0x20000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -44,18 +44,16 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x2 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s1, s2, 63 -; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s0, s0, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_and_b32 s4, s3, 63 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_bfe_u32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -65,14 +63,12 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_arg_imm: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 ; GFX6-NEXT: s_or_b32 s3, s3, 0x7b0000 -; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: s_bfe_u32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 
s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -85,14 +81,12 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_arg_imm_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s3, 59, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: s_bfe_u32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -105,15 +99,13 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; GFX6-LABEL: bfe_u32_imm_arg_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s3, s3, 63 -; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_or_b32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s4, s2, 63 +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_bfe_u32 s3, 0x7b, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -126,13 +118,11 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; GFX6-LABEL: bfe_u32_arg_0_width_reg_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x3 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: 
s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_and_b32 s3, s3, 63 -; GFX6-NEXT: s_bfe_u32 s3, s4, s3 +; GFX6-NEXT: s_bfe_u32 s3, s2, s3 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -162,16 +152,16 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 8 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 @@ -184,10 +174,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -208,10 +197,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -232,10 +220,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -256,10 +243,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -280,10 +266,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i8_offset_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -304,10 +289,9 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zext_in_reg_i16_offset_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s3, s3, 1 @@ -328,10 +312,9 @@ define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_1: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 @@ -348,10 +331,9 @@ define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_2: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -370,10 +352,9 @@ define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_3: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -392,10 +373,9 @@ define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_4: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x10000 @@ -415,10 +395,9 @@ define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_5: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s3, s3, 0x10000 @@ -438,10 +417,9 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_6: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -460,10 +438,9 @@ define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_7: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -482,10 +459,9 @@ define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, 
s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, s3, 31 @@ -504,10 +480,9 @@ define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_9: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1001f @@ -524,10 +499,9 @@ define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_10: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x1f0001 @@ -544,10 +518,9 @@ define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_11: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x180008 @@ -564,10 +537,9 @@ define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_12: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x80018 @@ -585,10 +557,9 @@ define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_13: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s3, s3, 31 @@ -606,10 +577,9 @@ define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_test_14: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s3, s3, 31 @@ -948,21 +918,21 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s8, 63 -; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: s_and_b32 s0, s8, 63 +; GFX6-NEXT: 
s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { @@ -995,13 +965,11 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GFX6-LABEL: v_lshr_and: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x2 -; GFX6-NEXT: s_load_dword s4, s[0:1], 0x3 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s3, s3, s4 +; GFX6-NEXT: s_lshr_b32 s3, s2, s3 ; GFX6-NEXT: s_and_b32 s3, s3, 7 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,8 +6,7 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s3 @@ -20,22 +19,18 @@ ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s2 
; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -622,57 +622,56 @@ define amdgpu_kernel void @sdivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_ashr_i32 s2, s10, 31 ; GFX8-NEXT: s_add_i32 s0, s10, s2 ; GFX8-NEXT: s_xor_b32 s3, s0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_ashr_i32 s12, s11, 31 -; GFX8-NEXT: s_add_i32 s0, s11, s12 +; GFX8-NEXT: s_ashr_i32 s10, s11, 31 +; GFX8-NEXT: s_add_i32 s0, s11, s10 +; GFX8-NEXT: s_xor_b32 s11, s0, s10 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_xor_b32 s11, s0, s12 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 -; GFX8-NEXT: s_ashr_i32 s10, s8, 31 +; GFX8-NEXT: s_sub_i32 s0, 0, s3 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX8-NEXT: s_ashr_i32 s12, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: 
s_add_i32 s0, s8, s10 -; GFX8-NEXT: s_xor_b32 s0, s0, s10 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_sub_i32 s8, 0, s11 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX8-NEXT: s_add_i32 s0, s8, s12 +; GFX8-NEXT: s_xor_b32 s0, s0, s12 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: s_xor_b32 s0, s10, s2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, s1, v1 +; GFX8-NEXT: v_mul_lo_u32 v3, v0, s3 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s0, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v0 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 +; GFX8-NEXT: s_xor_b32 s0, s12, s2 ; GFX8-NEXT: s_ashr_i32 s2, s9, 31 ; 
GFX8-NEXT: s_add_i32 s1, s9, s2 -; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, s12, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v2 ; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s12, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 @@ -683,7 +682,7 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s12 +; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 @@ -699,124 +698,121 @@ ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s10, 31 -; GFX9-NEXT: s_add_i32 s0, s10, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: s_add_i32 s5, s11, s4 +; GFX9-NEXT: s_ashr_i32 s8, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_xor_b32 s6, s6, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_ashr_i32 s9, s7, 31 +; GFX9-NEXT: s_add_i32 s7, s7, s9 +; GFX9-NEXT: s_xor_b32 s7, s7, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s11, 0, s7 +; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_i32 s12, 0, s6 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_ashr_i32 s10, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s10 -; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s8, s8, s10 +; GFX9-NEXT: s_sub_i32 s12, 0, s7 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s11, 0, s5 -; GFX9-NEXT: v_mul_lo_u32 v3, s11, v1 -; GFX9-NEXT: s_ashr_i32 s11, s9, 31 +; GFX9-NEXT: s_ashr_i32 s11, s5, 31 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 +; GFX9-NEXT: s_add_i32 s5, s5, s11 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_add_i32 s9, s9, s11 -; GFX9-NEXT: s_xor_b32 s9, s9, s11 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; 
GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, s10, s6 -; GFX9-NEXT: s_xor_b32 s4, s11, s4 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: s_xor_b32 s4, s10, s8 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: s_xor_b32 s4, s11, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 ; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s8, s2, 31 -; GFX10-NEXT: s_ashr_i32 s9, s3, 31 -; GFX10-NEXT: s_add_i32 s2, s2, s8 -; GFX10-NEXT: s_add_i32 s3, s3, s9 -; GFX10-NEXT: s_xor_b32 s2, s2, s8 -; GFX10-NEXT: s_xor_b32 s3, s3, s9 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX10-NEXT: s_ashr_i32 s1, s10, 31 +; GFX10-NEXT: 
s_ashr_i32 s2, s11, 31 +; GFX10-NEXT: s_add_i32 s0, s10, s1 +; GFX10-NEXT: s_add_i32 s3, s11, s2 +; GFX10-NEXT: s_xor_b32 s10, s0, s1 +; GFX10-NEXT: s_xor_b32 s3, s3, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s3 -; GFX10-NEXT: s_ashr_i32 s10, s0, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s10 +; GFX10-NEXT: s_sub_i32 s11, 0, s3 +; GFX10-NEXT: s_ashr_i32 s12, s9, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_ashr_i32 s11, s1, 31 -; GFX10-NEXT: s_add_i32 s0, s0, s10 -; GFX10-NEXT: s_add_i32 s1, s1, s11 -; GFX10-NEXT: s_xor_b32 s0, s0, s10 -; GFX10-NEXT: s_xor_b32 s1, s1, s11 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 +; GFX10-NEXT: s_ashr_i32 s11, s8, 31 +; GFX10-NEXT: s_add_i32 s0, s8, s11 +; GFX10-NEXT: s_add_i32 s8, s9, s12 +; GFX10-NEXT: s_xor_b32 s0, s0, s11 +; GFX10-NEXT: s_xor_b32 s8, s8, s12 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3 -; GFX10-NEXT: s_xor_b32 s1, s10, s8 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: 
v_sub_nc_u32_e32 v3, s8, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -824,25 +820,24 @@ ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s11, s9 +; GFX10-NEXT: s_xor_b32 s0, s12, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s11, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -528,8 +528,7 
@@ define amdgpu_kernel void @udivrem_v2i32(<2 x i32> addrspace(1)* %out0, <2 x i32> addrspace(1)* %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 @@ -583,106 +582,102 @@ ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s2 -; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX9-NEXT: s_sub_i32 s0, 0, s10 +; GFX9-NEXT: s_sub_i32 s1, 0, s11 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 
s2, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 -; GFX10-NEXT: s_sub_i32 s7, 0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX10-NEXT: s_sub_i32 s0, 0, s10 +; GFX10-NEXT: s_sub_i32 s1, 0, s11 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; 
GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 +; GFX10-NEXT: 
v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] ; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -274,14 +274,13 @@ ; FIXEDABI-SDAG-LABEL: addrspacecast_requires_queue_ptr: ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIXEDABI-SDAG-NEXT: s_load_dword s4, s[6:7], 0x44 -; FIXEDABI-SDAG-NEXT: s_load_dword s5, s[6:7], 0x40 +; FIXEDABI-SDAG-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x40 ; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; FIXEDABI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v2, s5 ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc -; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s5 +; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, s4 ; FIXEDABI-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 ; FIXEDABI-SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v0, vcc ; FIXEDABI-SDAG-NEXT: v_mov_b32_e32 v0, 1 @@ -296,14 +295,13 @@ ; FIXEDABI-GISEL-LABEL: addrspacecast_requires_queue_ptr: ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; FIXEDABI-GISEL-NEXT: s_load_dword s4, s[6:7], 0x44 -; FIXEDABI-GISEL-NEXT: s_load_dword s5, s[6:7], 0x40 +; FIXEDABI-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[6:7], 0x40 ; FIXEDABI-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, -1, v0 ; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc ; FIXEDABI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v3, s4 -; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v4, s5 +; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; FIXEDABI-GISEL-NEXT: v_mov_b32_e32 v4, s4 ; FIXEDABI-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, -1, v1 ; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; FIXEDABI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -522,50 +522,49 @@ ; GFX908-LABEL: introduced_copy_to_sgpr: ; GFX908: ; %bb.0: ; %bb ; GFX908-NEXT: global_load_ushort v24, v[0:1], off glc -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX908-NEXT: s_load_dword s6, s[4:5], 0x18 -; GFX908-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x10 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 +; GFX908-NEXT: s_load_dword s5, s[4:5], 0x18 +; GFX908-NEXT: s_mov_b32 s4, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX908-NEXT: s_sub_i32 s7, 0, s1 -; GFX908-NEXT: s_lshr_b32 s5, s6, 16 -; GFX908-NEXT: v_cvt_f32_f16_e32 v25, s6 +; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX908-NEXT: s_sub_i32 s6, 0, s3 +; GFX908-NEXT: s_lshl_b64 s[8:9], s[10:11], 5 +; GFX908-NEXT: s_lshr_b32 s12, s5, 16 ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX908-NEXT: s_lshl_b64 s[10:11], s[2:3], 5 -; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s5 -; GFX908-NEXT: s_or_b32 s10, s10, 28 +; GFX908-NEXT: 
v_cvt_f32_f16_e32 v25, s5 +; GFX908-NEXT: v_cvt_f32_f16_e32 v26, s12 +; GFX908-NEXT: s_or_b32 s8, s8, 28 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX908-NEXT: v_mov_b32_e32 v7, s3 -; GFX908-NEXT: s_mov_b32 s4, 0 -; GFX908-NEXT: v_mov_b32_e32 v6, s2 -; GFX908-NEXT: v_mul_lo_u32 v2, s7, v0 -; GFX908-NEXT: s_lshl_b64 s[6:7], s[8:9], 5 +; GFX908-NEXT: v_mov_b32_e32 v6, s10 +; GFX908-NEXT: v_mov_b32_e32 v7, s11 +; GFX908-NEXT: v_mul_lo_u32 v2, s6, v0 +; GFX908-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 ; GFX908-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX908-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX908-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s10 -; GFX908-NEXT: v_mov_b32_e32 v3, s11 -; GFX908-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX908-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, s8 +; GFX908-NEXT: v_mov_b32_e32 v3, s9 +; GFX908-NEXT: v_mul_lo_u32 v4, v0, s3 ; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX908-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX908-NEXT: v_sub_u32_e32 v4, s2, v4 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX908-NEXT: v_subrev_u32_e32 v5, s1, v4 +; GFX908-NEXT: v_subrev_u32_e32 v5, s3, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX908-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX908-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX908-NEXT: v_lshlrev_b64 v[4:5], 5, v[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s0, v24 -; GFX908-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX908-NEXT: s_mul_i32 s1, s9, s0 -; GFX908-NEXT: s_mul_hi_u32 s5, s8, s0 -; GFX908-NEXT: s_mul_i32 s0, s8, s0 -; GFX908-NEXT: s_add_i32 s1, s5, s1 +; GFX908-NEXT: v_readfirstlane_b32 s2, v24 +; GFX908-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX908-NEXT: s_mul_i32 s1, s1, s2 +; GFX908-NEXT: s_mul_hi_u32 s3, s0, s2 +; 
GFX908-NEXT: s_mul_i32 s0, s0, s2 +; GFX908-NEXT: s_add_i32 s1, s3, s1 ; GFX908-NEXT: s_lshl_b64 s[8:9], s[0:1], 5 ; GFX908-NEXT: s_branch .LBB3_2 ; GFX908-NEXT: .LBB3_1: ; %bb12 @@ -662,50 +661,49 @@ ; GFX90A-LABEL: introduced_copy_to_sgpr: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: global_load_ushort v28, v[0:1], off glc -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x10 -; GFX90A-NEXT: s_load_dword s3, s[4:5], 0x18 +; GFX90A-NEXT: s_load_dword s7, s[4:5], 0x18 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_mov_b32 s6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX90A-NEXT: s_sub_i32 s12, 0, s7 -; GFX90A-NEXT: s_lshr_b32 s13, s3, 16 -; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX90A-NEXT: s_sub_i32 s12, 0, s3 +; GFX90A-NEXT: s_lshr_b32 s13, s7, 16 +; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s13 +; GFX90A-NEXT: s_lshl_b64 s[4:5], s[0:1], 5 ; GFX90A-NEXT: s_lshl_b64 s[10:11], s[8:9], 5 -; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: s_mov_b32 s2, 0 +; GFX90A-NEXT: s_or_b32 s10, s10, 28 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[10:11], s[10:11] op_sel:[0,1] ; GFX90A-NEXT: v_mul_lo_u32 v8, s12, v0 ; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 ; GFX90A-NEXT: v_add_u32_e32 v0, v0, v8 -; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 -; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s7 -; GFX90A-NEXT: v_sub_u32_e32 v8, s6, v8 +; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s3 +; GFX90A-NEXT: v_sub_u32_e32 v8, s2, v8 ; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 -; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s7, v8 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc -; GFX90A-NEXT: v_subrev_u32_e32 v9, s7, v8 +; GFX90A-NEXT: v_subrev_u32_e32 v9, s3, v8 ; GFX90A-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; GFX90A-NEXT: v_add_u32_e32 v9, 1, v0 -; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s7, v8 +; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v8 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc ; GFX90A-NEXT: v_lshlrev_b64 v[8:9], 5, v[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], 0, 0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s3, v28 -; GFX90A-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX90A-NEXT: s_mul_i32 s1, s1, s3 -; GFX90A-NEXT: s_mul_hi_u32 s6, s0, s3 -; GFX90A-NEXT: s_mul_i32 s0, s0, s3 -; GFX90A-NEXT: s_add_i32 s1, s6, s1 -; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], 5 +; GFX90A-NEXT: v_readfirstlane_b32 s2, v28 +; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX90A-NEXT: s_mul_i32 s1, s1, s2 +; GFX90A-NEXT: s_mul_hi_u32 s3, s0, s2 +; GFX90A-NEXT: s_mul_i32 s0, s0, s2 +; GFX90A-NEXT: s_add_i32 s1, s3, s1 +; GFX90A-NEXT: s_lshl_b64 s[2:3], s[0:1], 5 ; GFX90A-NEXT: s_branch .LBB3_2 ; GFX90A-NEXT: .LBB3_1: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -720,38 +718,38 @@ ; GFX90A-NEXT: ; %bb.3: ; %bb14 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off -; GFX90A-NEXT: s_mov_b32 s3, s2 -; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[2:3], s[2:3] op_sel:[0,1] -; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_mov_b32 s7, s6 +; GFX90A-NEXT: v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[20:21], s[6:7], s[6:7] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[4:5] ; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: 
v_pk_mov_b32 v[22:23], v[16:17], v[16:17] op_sel:[0,1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_readfirstlane_b32 s3, v12 +; GFX90A-NEXT: v_readfirstlane_b32 s7, v12 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v13 -; GFX90A-NEXT: s_add_u32 s3, s3, 1 +; GFX90A-NEXT: s_add_u32 s7, s7, 1 ; GFX90A-NEXT: s_addc_u32 s9, s8, 0 -; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s3 -; GFX90A-NEXT: s_mul_i32 s11, s5, s3 -; GFX90A-NEXT: s_mul_i32 s8, s4, s3 -; GFX90A-NEXT: s_mul_i32 s3, s4, s9 -; GFX90A-NEXT: s_add_i32 s3, s10, s3 -; GFX90A-NEXT: s_add_i32 s3, s3, s11 +; GFX90A-NEXT: s_mul_hi_u32 s10, s4, s7 +; GFX90A-NEXT: s_mul_i32 s11, s5, s7 +; GFX90A-NEXT: s_mul_i32 s8, s4, s7 +; GFX90A-NEXT: s_mul_i32 s7, s4, s9 +; GFX90A-NEXT: s_add_i32 s7, s10, s7 +; GFX90A-NEXT: s_add_i32 s7, s7, s11 ; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_4: ; %bb58 ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v12, vcc, v12, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc -; GFX90A-NEXT: v_mov_b32_e32 v24, s7 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s6, v14 +; GFX90A-NEXT: v_mov_b32_e32 v24, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s2, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v24, vcc ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[12:13] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: .LBB3_5: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_mov_b32_e32 v25, s3 +; GFX90A-NEXT: v_mov_b32_e32 v25, s7 ; GFX90A-NEXT: v_add_co_u32_e32 v24, vcc, s8, v14 ; GFX90A-NEXT: v_addc_co_u32_e32 v25, vcc, v15, v25, vcc ; GFX90A-NEXT: global_load_dword v30, v[24:25], off offset:-12 glc diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll --- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll @@ -4,7 +4,7 @@ declare i32 
@llvm.amdgcn.readfirstlane(i32) ; GCN-LABEL: readfirstlane_uniform -; GCN: s_load_dwordx2 s[[[IN_ADDR:[0-9]+]]:1], s[4:5], 0x0 +; GCN: s_load_dwordx4 s[[[IN_ADDR:[0-9]+]]:3], s[4:5], 0x0 ; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0 ; GCN: s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]] ; GCN: s_load_dword s{{[0-9]+}}, s[[[LOAD_ADDR]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -39,17 +39,18 @@ ; ; GFX6-LABEL: udiv_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -63,15 +64,13 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 @@ -133,36 +132,36 @@ ; ; GFX6-LABEL: urem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_sub_i32 s2, 0, s5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s3, v0 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 @@ -235,7 +234,7 @@ ; ; GFX6-LABEL: sdiv_i32: ; GFX6: ; 
%bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -244,21 +243,22 @@ ; GFX6-NEXT: s_xor_b32 s3, s3, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_ashr_i32 s9, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_xor_b32 s2, s2, s9 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_add_i32 s1, s2, s0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: s_xor_b32 s2, s0, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 @@ -266,17 +266,15 @@ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 @@ -355,8 +353,7 @@ ; ; GFX6-LABEL: srem_i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: s_add_i32 s3, s3, s4 @@ -390,9 +387,8 @@ ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: s_add_i32 s3, s3, s4 @@ -4592,32 +4588,33 @@ ; ; GFX6-LABEL: udiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s3, s2, 0x7fff -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 +; 
GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4641,39 +4638,38 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff -; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff +; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f -; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 @@ -4779,56 +4775,57 @@ ; ; GFX6-LABEL: urem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: 
v_cvt_f32_u32_e32 v5, s1 -; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s3, s2, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: 
s_lshr_b32 s4, s6, 15 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 @@ -4836,35 +4833,34 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; 
GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 @@ -4890,9 +4886,9 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX9-NEXT: s_lshr_b32 s0, s2, 15 +; GFX9-NEXT: s_lshr_b32 s0, s6, 15 ; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] @@ -4988,48 +4984,49 @@ ; ; GFX6-LABEL: sdiv_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 -; GFX6-NEXT: v_mov_b32_e32 v1, s0 -; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 -; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s1, s1, s3 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s4, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf000f ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 ; 
GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX6-NEXT: s_or_b32 s1, s1, 1 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc -; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf000f ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 30 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 ; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 +; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_mov_b32_e32 v6, s0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_or_b32 s4, s4, 1 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v6, s4 ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -5051,28 +5048,27 @@ ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[6:7], 
s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 -; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 ; GFX9-NEXT: s_or_b32 s3, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -5081,24 +5077,24 @@ ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX9-NEXT: s_cselect_b32 s0, s3, 0 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f +; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f +; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 ; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 30 +; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 @@ -5121,9 +5117,9 @@ ; 
GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[6:7] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, <3 x i15> addrspace(1)* %out @@ -5215,52 +5211,53 @@ ; ; GFX6-LABEL: srem_v3i15: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s9, s2, 0xf0000 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_bfe_i32 s9, s6, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 -; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 -; GFX6-NEXT: s_xor_b32 s1, s9, s1 -; GFX6-NEXT: s_ashr_i32 s1, s1, 30 -; GFX6-NEXT: s_or_b32 s1, s1, 1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s1 -; GFX6-NEXT: s_lshr_b32 s8, s0, 15 -; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX6-NEXT: s_xor_b32 s5, s9, s5 +; GFX6-NEXT: s_ashr_i32 s5, s5, 30 +; GFX6-NEXT: s_or_b32 s5, s5, 1 ; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v6, v6 ; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX6-NEXT: v_mov_b32_e32 v7, s5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 
; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 -; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, s8 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s5 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_lshr_b32 s4, s6, 15 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 +; GFX6-NEXT: s_bfe_i32 s6, s6, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: s_xor_b32 s0, s1, s0 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; GFX6-NEXT: s_xor_b32 s5, s6, s5 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 ; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v7, v7 ; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: s_or_b32 s0, s0, 1 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: v_mov_b32_e32 v8, s0 +; GFX6-NEXT: s_or_b32 s5, s5, 1 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: v_mov_b32_e32 v8, s5 ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 @@ -5275,57 +5272,56 @@ ; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX6-NEXT: s_lshr_b32 s7, s8, 15 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8 +; GFX6-NEXT: v_mul_lo_u32 v5, v5, s7 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: s_lshr_b32 s3, s2, 15 -; GFX6-NEXT: 
v_sub_i32_e32 v2, vcc, s3, v5 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v5 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 ; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 -; GFX9-NEXT: s_lshr_b32 s8, s2, 15 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: s_lshr_b32 s8, s6, 15 ; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 ; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_lshr_b32 s3, s6, 15 +; GFX9-NEXT: v_alignbit_b32 v0, s7, 
v0, 30 +; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 ; GFX9-NEXT: s_or_b32 s7, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s7, 0 ; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 -; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f +; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f +; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 @@ -5336,12 +5332,12 @@ ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 -; GFX9-NEXT: s_or_b32 s6, s0, 1 +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: s_or_b32 s2, s0, 1 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s6, 0 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 ; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 ; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 @@ -5359,7 +5355,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 @@ -5460,27 +5456,27 @@ ; ; GFX6-LABEL: udiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_add_i32 s5, s5, 12 -; GFX6-NEXT: s_lshr_b32 s4, s4, s5 -; 
GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_add_i32 s0, s3, 12 +; GFX6-NEXT: s_lshr_b32 s0, s2, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s3, 12 -; GFX9-NEXT: s_lshr_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_add_i32 s3, s3, 12 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5501,29 +5497,29 @@ ; ; GFX6-LABEL: udiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshr_b32 s4, s4, 12 -; GFX6-NEXT: s_lshr_b32 s5, s5, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_lshr_b32 s0, s2, 12 +; GFX6-NEXT: s_lshr_b32 s1, s3, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 12 -; GFX9-NEXT: s_lshr_b32 s1, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -5543,37 +5539,37 @@ ; ; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX6-NEXT: s_lshr_b32 s4, s4, 12 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_lshr_b32 s0, s2, 12 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
s_mul_hi_u32 s1, s3, 0x100101 -; GFX9-NEXT: s_lshr_b32 s0, s2, 12 -; GFX9-NEXT: s_sub_i32 s2, s3, s1 -; GFX9-NEXT: s_lshr_b32 s2, s2, 1 -; GFX9-NEXT: s_add_i32 s2, s2, s1 -; GFX9-NEXT: s_lshr_b32 s1, s2, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_add_i32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b32 s2, s2, 12 +; GFX9-NEXT: s_lshr_b32 s3, s3, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -5843,29 +5839,29 @@ ; ; GFX6-LABEL: urem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 -; GFX6-NEXT: s_add_i32 s5, s5, -1 -; GFX6-NEXT: s_and_b32 s4, s4, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s3 +; GFX6-NEXT: s_add_i32 s0, s0, -1 +; GFX6-NEXT: s_and_b32 s0, s2, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s3 -; GFX9-NEXT: s_add_i32 s0, s0, -1 -; 
GFX9-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 +; GFX9-NEXT: s_add_i32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5886,29 +5882,29 @@ ; ; GFX6-LABEL: urem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s4, s4, 0xfff -; GFX6-NEXT: s_and_b32 s5, s5, 0xfff -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_and_b32 s0, s2, 0xfff +; GFX6-NEXT: s_and_b32 s1, s3, 0xfff +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xfff -; GFX9-NEXT: s_and_b32 s1, s3, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xfff +; GFX9-NEXT: s_and_b32 s3, s3, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x 
i32> %r, <2 x i32> addrspace(1)* %out @@ -6170,7 +6166,7 @@ ; ; GFX6-LABEL: sdiv_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -6180,21 +6176,22 @@ ; GFX6-NEXT: s_xor_b32 s3, s3, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX6-NEXT: s_sub_i32 s4, 0, s3 +; GFX6-NEXT: s_ashr_i32 s9, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_xor_b32 s2, s2, s9 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_xor_b32 s8, s9, s8 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_ashr_i32 s0, s2, 31 -; GFX6-NEXT: s_add_i32 s1, s2, s0 +; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s1, s1, s0 -; GFX6-NEXT: s_xor_b32 s2, s0, s8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 @@ -6202,17 +6199,15 @@ ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -6264,41 +6259,41 @@ ; ; GFX6-LABEL: sdiv_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s6, s4, 31 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_lshr_b32 s6, s6, 20 -; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: s_lshr_b32 s6, s7, 20 -; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_ashr_i32 s4, s4, 12 -; GFX6-NEXT: s_ashr_i32 s5, s5, 12 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_ashr_i32 s0, s2, 31 +; GFX6-NEXT: s_ashr_i32 s1, s3, 31 +; GFX6-NEXT: s_lshr_b32 s0, s0, 20 +; GFX6-NEXT: s_lshr_b32 s1, s1, 20 +; GFX6-NEXT: s_add_i32 s0, s2, s0 +; GFX6-NEXT: s_add_i32 s1, s3, s1 +; GFX6-NEXT: s_ashr_i32 s0, s0, 12 +; GFX6-NEXT: s_ashr_i32 s1, s1, 12 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s2, 31 -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: 
s_add_i32 s0, s2, s0 -; GFX9-NEXT: s_add_i32 s1, s3, s1 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_ashr_i32 s1, s1, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_add_i32 s3, s3, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_ashr_i32 s3, s3, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -6318,43 +6313,43 @@ ; ; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 -; GFX6-NEXT: s_ashr_i32 s6, s4, 31 -; GFX6-NEXT: s_lshr_b32 s6, s6, 20 -; GFX6-NEXT: s_add_i32 s4, s4, s6 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 -; GFX6-NEXT: s_ashr_i32 s4, s4, 12 +; GFX6-NEXT: v_mul_hi_i32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_ashr_i32 s0, s2, 31 +; GFX6-NEXT: s_lshr_b32 s0, s0, 20 +; GFX6-NEXT: s_add_i32 s0, s2, s0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s3, v0 +; GFX6-NEXT: s_ashr_i32 s0, s0, 12 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 +; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s2, 31 -; GFX9-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s0, s2, s0 -; GFX9-NEXT: s_lshr_b32 s2, s1, 31 -; GFX9-NEXT: s_ashr_i32 s1, s1, 11 -; GFX9-NEXT: s_ashr_i32 s0, s0, 12 -; GFX9-NEXT: s_add_i32 s1, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_i32 s5, s5, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s3, s5, 31 +; GFX9-NEXT: s_ashr_i32 s4, s5, 11 +; GFX9-NEXT: s_ashr_i32 s2, s2, 12 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -6686,8 +6681,7 @@ ; ; GFX6-LABEL: srem_i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 @@ -6722,9 +6716,8 @@ ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -6775,45 +6768,45 @@ ; ; GFX6-LABEL: srem_v2i32_pow2k_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s6, s4, 31 -; GFX6-NEXT: s_lshr_b32 s6, s6, 20 -; GFX6-NEXT: s_add_i32 s6, s4, s6 -; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX6-NEXT: s_sub_i32 s4, s4, s6 -; GFX6-NEXT: s_lshr_b32 s6, s7, 20 -; GFX6-NEXT: s_add_i32 s6, s5, s6 -; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 -; GFX6-NEXT: v_mov_b32_e32 v1, s5 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_ashr_i32 s0, s2, 31 +; GFX6-NEXT: s_ashr_i32 s1, s3, 31 +; GFX6-NEXT: s_lshr_b32 s0, s0, 20 +; GFX6-NEXT: s_lshr_b32 s1, s1, 20 +; GFX6-NEXT: s_add_i32 s0, s2, s0 +; GFX6-NEXT: s_add_i32 s1, s3, s1 +; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s0, s2, s0 +; GFX6-NEXT: s_sub_i32 s1, s3, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s0, s2, 31 -; GFX9-NEXT: s_ashr_i32 s1, s3, 31 -; GFX9-NEXT: s_lshr_b32 s0, s0, 20 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s0, s2, s0 -; 
GFX9-NEXT: s_add_i32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 -; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s0, s2, s0 -; GFX9-NEXT: s_sub_i32 s1, s3, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: s_ashr_i32 s4, s2, 31 +; GFX9-NEXT: s_ashr_i32 s5, s3, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_lshr_b32 s5, s5, 20 +; GFX9-NEXT: s_add_i32 s4, s2, s4 +; GFX9-NEXT: s_add_i32 s5, s3, s5 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -28,7 +28,7 @@ ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120 ; HSA-PROMOTE: .end_amd_kernel_code_t -; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2 +; HSA-PROMOTE: s_load_dwordx2 s[{{[0-9:]+}}], s[4:5], 0x1 ; SI-PROMOTE: ds_write_b32 ; SI-PROMOTE: ds_write_b32 diff --git a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll --- a/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll @@ -2,8 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32: -; GCN: s_load_dwordx4 -; GCN-DAG: s_load_dwordx4 +; GCN: s_load_dwordx8 ; GCN-DAG: s_load_dword ; GCN: {{buffer|flat}}_store_byte diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll 
b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -45,9 +45,9 @@ } ; GCN-LABEL: {{^}}s_ubfe_sub_i32: -; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] -; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]] +; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[#LOAD + 2]], [[SUB]] ; GCN: s_lshr_b32 s{{[0-9]+}}, [[TMP]], [[SUB]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -60,9 +60,9 @@ } ; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32: -; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] -; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[#LOAD + 2]], [[SUB]] ; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -119,9 +119,9 @@ } ; GCN-LABEL: {{^}}s_sbfe_sub_i32: -; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] -; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]] +; GCN: s_lshl_b32 [[TMP:s[0-9]+]], s[[#LOAD + 2]], [[SUB]] ; GCN: s_ashr_i32 s{{[0-9]+}}, [[TMP]], [[SUB]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 
@llvm.amdgcn.workitem.id.x() @@ -134,9 +134,9 @@ } ; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32: -; GCN: s_load_dwordx2 s[[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]], s[0:1], {{0xb|0x2c}} -; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]] -; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]] +; GCN: s_load_dwordx4 s[[[#LOAD:]]:[[END:[0-9]+]]], s[0:1], {{0x9|0x24}} +; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[#LOAD + 3]] +; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[#LOAD + 2]], [[SUB]] ; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -11,77 +11,74 @@ define amdgpu_kernel void @s_bfi_def_i32(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_def_i32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s6, s6, s4 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_andn2_b32 s4, s8, s6 +; GFX7-NEXT: s_and_b32 s5, s7, s6 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_def_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: 
s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s4, s4, s2 -; GFX8-NEXT: s_and_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_and_b32 s1, s7, s6 +; GFX8-NEXT: s_andn2_b32 s0, s0, s6 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_def_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s4, s4, s2 -; GFX10-NEXT: s_and_b32 s2, s3, s2 -; GFX10-NEXT: s_or_b32 s2, s4, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_and_b32 s1, s7, s6 +; GFX10-NEXT: s_andn2_b32 s0, s0, s6 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_def_i32: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_andn2_b32 s4, s4, s2 -; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2 -; GFX8-GISEL-NEXT: s_or_b32 s2, s4, s2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-GISEL-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6 +; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6 +; GFX8-GISEL-NEXT: s_or_b32 s0, s0, s1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_def_i32: ; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_clause 0x2 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_andn2_b32 s4, s4, s2 -; GFX10-GISEL-NEXT: s_and_b32 s2, s3, s2 -; GFX10-GISEL-NEXT: s_or_b32 s2, s4, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6 +; GFX10-GISEL-NEXT: s_andn2_b32 s0, s0, s6 +; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = xor i32 %x, -1 @@ -137,77 +134,74 @@ define amdgpu_kernel void @s_bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ch: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s4, s4, s5 -; GFX7-NEXT: s_xor_b32 s4, s6, s4 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: 
s_xor_b32 s4, s7, s8 +; GFX7-NEXT: s_and_b32 s4, s6, s4 +; GFX7-NEXT: s_xor_b32 s4, s8, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ch: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s3, s3, s4 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_xor_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_xor_b32 s1, s7, s0 +; GFX8-NEXT: s_and_b32 s1, s6, s1 +; GFX8-NEXT: s_xor_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s3, s3, s4 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_xor_b32 s2, s4, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_xor_b32 s1, s7, s0 +; GFX10-NEXT: s_and_b32 s1, s6, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ch: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x2c -; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_xor_b32 s3, s3, s4 -; GFX8-GISEL-NEXT: s_and_b32 s2, s2, s3 -; GFX8-GISEL-NEXT: s_xor_b32 s2, s4, s2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0 +; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s1 +; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_sha256_ch: ; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_clause 0x2 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_xor_b32 s3, s3, s4 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s3 -; GFX10-GISEL-NEXT: s_xor_b32 s2, s4, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0 +; GFX10-GISEL-NEXT: s_and_b32 s1, s6, s1 +; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = xor i32 %y, %z @@ -478,82 +472,79 @@ define amdgpu_kernel void @s_bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { ; GFX7-LABEL: s_bfi_sha256_ma: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GFX7-NEXT: s_load_dword s6, s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_or_b32 s4, s4, s6 -; GFX7-NEXT: s_and_b32 s4, s5, s4 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_mov_b32 s1, s5 +; GFX7-NEXT: s_or_b32 s5, s6, s8 +; GFX7-NEXT: s_mov_b32 s0, s4 +; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s5, s7, s5 +; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_bfi_sha256_ma: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s2, s4 -; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s2, s3, s2 -; GFX8-NEXT: s_or_b32 s2, s5, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_and_b32 s1, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_and_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s1, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_bfi_sha256_ma: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: 
s_load_dword s0, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_or_b32 s5, s2, s4 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s3, s3, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_or_b32 s1, s6, s0 +; GFX10-NEXT: s_and_b32 s0, s6, s0 +; GFX10-NEXT: s_and_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX8-GISEL-LABEL: s_bfi_sha256_ma: ; GFX8-GISEL: ; %bb.0: ; %entry -; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX8-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34 ; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-GISEL-NEXT: s_and_b32 s5, s2, s4 -; GFX8-GISEL-NEXT: s_or_b32 s2, s2, s4 -; GFX8-GISEL-NEXT: s_and_b32 s2, s3, s2 -; GFX8-GISEL-NEXT: s_or_b32 s2, s5, s2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0 +; GFX8-GISEL-NEXT: s_or_b32 s0, s6, s0 +; GFX8-GISEL-NEXT: s_and_b32 s0, s7, s0 +; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2 ; GFX8-GISEL-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_bfi_sha256_ma: ; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_clause 0x2 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dword s4, s[0:1], 0x34 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_clause 0x1 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dword 
s0, s[0:1], 0x34 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_or_b32 s5, s2, s4 -; GFX10-GISEL-NEXT: s_and_b32 s2, s2, s4 -; GFX10-GISEL-NEXT: s_and_b32 s3, s3, s5 -; GFX10-GISEL-NEXT: s_or_b32 s2, s2, s3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0 +; GFX10-GISEL-NEXT: s_and_b32 s0, s6, s0 +; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s1 +; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm entry: %0 = and i32 %x, %z diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -5,20 +5,20 @@ define amdgpu_kernel void @s_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { ; SI-LABEL: s_bfm_pattern: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfm_b32 s4, s4, s5 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bfm_b32 s2, s2, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_bfm_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- 
a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -64,42 +64,45 @@ define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 -; FLAT-NEXT: s_mov_b32 s6, s2 -; FLAT-NEXT: s_mov_b32 s7, s3 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_mov_b32 s10, s6 +; FLAT-NEXT: s_mov_b32 s11, s7 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s8, s2 +; FLAT-NEXT: s_mov_b32 s9, s3 +; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; FLAT-NEXT: s_mov_b32 s4, s0 +; FLAT-NEXT: s_mov_b32 s5, s1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; FLAT-NEXT: s_waitcnt 
lgkmcnt(0) -; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0 +; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -161,27 +164,27 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -196,9 +199,8 @@ ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -222,36 +224,37 @@ define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { ; SI-LABEL: s_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s5, s5 -; SI-NEXT: s_brev_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_brev_b32 s0, s3 +; SI-NEXT: s_brev_b32 s1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b32 s5, s5 -; FLAT-NEXT: s_brev_b32 s4, s4 -; FLAT-NEXT: v_mov_b32_e32 v0, s4 -; FLAT-NEXT: v_mov_b32_e32 v1, s5 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FLAT-NEXT: s_mov_b32 s4, s0 +; FLAT-NEXT: s_mov_b32 s5, s1 +; FLAT-NEXT: s_brev_b32 s0, s3 +; FLAT-NEXT: s_brev_b32 s1, s2 +; FLAT-NEXT: v_mov_b32_e32 v0, s1 +; FLAT-NEXT: v_mov_b32_e32 v1, s0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; 
FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b32 s2, s2 ; GISEL-NEXT: s_brev_b32 s3, s3 @@ -269,28 +272,28 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v1, v1 ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -306,9 +309,8 @@ ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -333,34 +335,35 @@ define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; SI-LABEL: s_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_brev_b64 s[0:1], s[2:3] +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5] -; FLAT-NEXT: v_mov_b32_e32 v0, s4 -; FLAT-NEXT: v_mov_b32_e32 v1, s5 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FLAT-NEXT: s_mov_b32 s4, s0 +; FLAT-NEXT: s_mov_b32 s5, s1 +; FLAT-NEXT: s_brev_b64 s[0:1], s[2:3] +; FLAT-NEXT: v_mov_b32_e32 v0, s0 +; FLAT-NEXT: v_mov_b32_e32 v1, s1 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] ; GISEL-NEXT: v_mov_b32_e32 v0, s2 @@ -377,28 +380,28 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v2, v0 ; SI-NEXT: v_bfrev_b32_e32 v1, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -414,9 +417,8 @@ ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: 
v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -494,30 +496,30 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v4, v2 ; SI-NEXT: v_bfrev_b32_e32 v3, v3 ; SI-NEXT: v_bfrev_b32_e32 v2, v0 ; SI-NEXT: v_bfrev_b32_e32 v1, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s3 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -535,9 +537,8 @@ ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 -; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v0, s2 
; GISEL-NEXT: v_mov_b32_e32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -11,15 +11,12 @@ ; DBG-LABEL: cluster_load_cluster_store: -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8 -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8 +; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 -; DBG: Cluster ld/st SU(1) - SU(2) - ; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]]) ; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]]) ; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]]) @@ -33,17 +30,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: flat_store_dword v[0:1], v3 offset:8 @@ -53,20 +49,18 @@ ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s6, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s7, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -76,16 +70,16 @@ ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 16 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_add_u32 s0, s2, 16 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_add_u32 s2, s2, 24 +; GFX10-NEXT: s_addc_u32 s3, 
s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -102,17 +96,15 @@ ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) @@ -147,15 +139,12 @@ ; DBG-LABEL: cluster_load_valu_cluster_store: -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8 -; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8 +; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 ; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4 -; DBG: Cluster ld/st SU(1) - SU(2) - ; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]]) ; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]]) ; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]]) @@ -169,17 +158,16 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* 
noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 @@ -190,20 +178,18 @@ ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s6, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_addc_u32 s7, s3, 0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_add_u32 s4, s0, 8 +; GFX10-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: s_add_u32 s6, s0, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: s_addc_u32 s7, s1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_add_u32 s0, s0, 24 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: flat_load_dword v6, v[2:3] 
@@ -213,18 +199,18 @@ ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s4, 8 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: s_add_u32 s0, s2, 8 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: s_add_u32 s4, s2, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s3, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_addc_u32 s5, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_add_u32 s0, s2, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -240,17 +226,15 @@ ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) diff --git 
a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -650,37 +650,36 @@ define amdgpu_kernel void @sub_zext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_zext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword 
v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -700,37 +699,36 @@ define amdgpu_kernel void @sub_sext_setcc_commute(i32 addrspace(1)* nocapture %arg, i32 %a, i32%b) { ; GCN-LABEL: sub_sext_setcc_commute: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] ; GCN-NEXT: buffer_load_dword v4, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s1, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 ; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[2:3] +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s5, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[2:3] +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git 
a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -106,28 +106,28 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -163,9 +163,8 @@ ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -177,9 +176,8 
@@ ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -191,9 +189,8 @@ ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -214,30 +211,30 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v1, 32, v1 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -278,9 +275,8 @@ ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -294,9 +290,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -310,9 +305,8 @@ ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] @@ -336,16 +330,18 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: 
v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -355,15 +351,13 @@ ; SI-NEXT: v_min_u32_e32 v2, 32, v2 ; SI-NEXT: v_min_u32_e32 v1, 32, v1 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -414,9 +408,8 @@ ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] @@ -434,9 +427,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: 
global_load_dwordx4 v[0:3], v0, s[2:3] @@ -454,9 +446,8 @@ ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] @@ -485,39 +476,43 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_add_u16_e32 v0, -8, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8: @@ -552,9 +547,8 @@ ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -567,9 +561,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -581,9 +574,8 @@ ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -706,32 +698,34 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 
s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 -; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: s_flbit_i32_b32 s5, s5 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_flbit_i32_b32 s0, s2 +; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf +; SI-NEXT: s_flbit_i32_b32 s1, s3 +; SI-NEXT: s_add_i32 s0, s0, 32 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: v_min3_u32 v0, s0, v0, 64 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s4, s4 -; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp -; VI-NEXT: s_flbit_i32_b32 s4, s5 -; VI-NEXT: v_min3_u32 v0, v0, s4, 64 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp +; VI-NEXT: s_flbit_i32_b32 s0, s3 +; VI-NEXT: v_min3_u32 v0, v0, s0, 64 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_i64_trunc: @@ -753,36 +747,30 @@ ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s0, s2 -; 
GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp -; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_flbit_i32_b32 s2, s2 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp +; GFX10-NEXT: s_flbit_i32_b32 s2, s3 +; GFX10-NEXT: v_min3_u32 v0, v0, s2, 64 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clz_i32_u32 s2, s2 @@ -803,14 +791,15 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], 
v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_min_u32_e32 v2, 0xffffffdf, v2 @@ -818,15 +807,13 @@ ; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_min3_u32 v2, v2, v3, 64 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -873,9 +860,8 @@ ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -889,9 +875,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -906,9 +891,8 @@ ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 
; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -933,30 +917,29 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; SI-NEXT: v_ffbh_u32_e32 v3, v4 ; SI-NEXT: v_min3_u32 v0, v0, v3, 64 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -1003,9 +986,8 @@ ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] @@ -1019,9 +1001,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] @@ -1036,9 +1017,8 @@ ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] @@ -1064,27 +1044,27 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1121,9 +1101,8 @@ ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1134,9 +1113,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1150,9 +1128,8 @@ ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1173,27 +1150,27 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 
-; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1230,9 +1207,8 @@ ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1243,9 +1219,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1259,9 +1234,8 @@ ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1283,30 +1257,30 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1348,9 +1322,8 @@ ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1364,9 +1337,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1380,9 +1352,8 @@ ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1407,30 +1378,30 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, 
v0, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1472,9 +1443,8 @@ ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1488,9 +1458,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1504,9 +1473,8 @@ ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1531,25 +1499,25 @@ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1591,9 +1559,8 @@ ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1603,9 +1570,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 @@ -1624,9 +1590,7 @@ ; ; GFX11-LABEL: 
v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -1648,39 +1612,43 @@ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, 
s[8:11], 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i16_sel_eq_neg1: @@ -1712,9 +1680,8 @@ ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1728,9 +1695,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1745,9 +1711,8 @@ ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1773,26 +1738,26 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1835,9 +1800,8 @@ ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1848,9 +1812,8 @@ ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 @@ -1871,9 +1834,7 @@ ; ; GFX11-LABEL: 
v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -70,27 +70,27 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -124,9 +124,8 @@ ; ; 
GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -145,28 +144,28 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -202,9 +201,8 @@ ; ; GFX9-GISEL-LABEL: 
v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -224,30 +222,30 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ 
-287,9 +285,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] @@ -311,26 +308,26 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -376,9 +373,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; 
GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 @@ -468,23 +464,23 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s4 -; SI-NEXT: s_flbit_i32_b32 s4, s5 +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_flbit_i32_b32 s3, s3 ; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: s_min_u32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_min_u32 s2, s2, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_flbit_i32_b32 s2, s2 ; VI-NEXT: s_flbit_i32_b32 s3, s3 @@ -513,13 +509,12 @@ ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: 
global_store_dword v1, v0, s[4:5] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -530,29 +525,28 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; SI-NEXT: v_ffbh_u32_e32 v3, v3 ; SI-NEXT: v_min_u32_e32 v2, v2, v3 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -597,9 +591,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -622,29 +615,28 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; SI-NEXT: v_ffbh_u32_e32 v3, v4 ; SI-NEXT: v_min_u32_e32 v0, v0, v3 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -689,9 +681,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] @@ -715,27 +706,27 @@ define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -770,9 +761,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -795,27 +785,27 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, 
i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -850,9 +840,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -875,25 +864,25 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: 
v_ctlz_zero_undef_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -935,9 +924,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 @@ -965,32 +953,32 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: 
s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1038,9 +1026,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1069,29 +1056,29 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 
addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1128,9 +1115,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1154,29 +1140,29 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias 
%out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1213,9 +1199,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1239,29 +1224,29 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 
addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1299,9 +1284,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1325,29 +1309,29 @@ define amdgpu_kernel void 
@v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1385,9 +1369,8 @@ ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -72,27 +72,27 @@ define amdgpu_kernel void @v_ctpop_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -142,49 +142,49 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in0, i16 addrspace(1)* noalias %in1) nounwind { ; SI-LABEL: v_ctpop_add_chain_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 
0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 glc +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_add_chain_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] glc ; VI-NEXT: 
s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_ushort v1, v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_chain_i16: @@ -239,39 +239,39 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %sval) nounwind { ; SI-LABEL: v_ctpop_add_sgpr_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 -; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_add_sgpr_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, 
s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_add_sgpr_i16: @@ -320,16 +320,18 @@ define amdgpu_kernel void @v_ctpop_v2i16(<2 x i16> addrspace(1)* noalias %out, <2 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -337,15 +339,13 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -400,16 +400,18 @@ define amdgpu_kernel void @v_ctpop_v4i16(<4 x i16> addrspace(1)* noalias %out, <4 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v4i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -423,15 +425,13 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -524,16 +524,18 @@ define amdgpu_kernel void @v_ctpop_v8i16(<8 x i16> 
addrspace(1)* noalias %out, <8 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v8i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -559,15 +561,13 @@ ; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -704,17 +704,19 @@ define amdgpu_kernel void @v_ctpop_v16i16(<16 x i16> addrspace(1)* noalias %out, <16 x i16> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_v16i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 
0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 -; SI-NEXT: v_mov_b32_e32 v5, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 offset:16 -; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -765,16 +767,14 @@ ; SI-NEXT: v_or_b32_e32 v6, v10, v17 ; SI-NEXT: v_or_b32_e32 v5, v9, v18 ; SI-NEXT: v_or_b32_e32 v4, v8, v19 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 @@ -1020,27 +1020,27 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, 
v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1092,27 +1092,27 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 
0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1164,28 +1164,28 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctpop_i16_add_literal: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_movk_i32 s4, 0x3e7 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_movk_i32 s0, 0x3e7 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s4, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; 
VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1238,39 +1238,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_var(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 -; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 -; VI-NEXT: 
buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var: @@ -1319,39 +1319,39 @@ define amdgpu_kernel void @v_ctpop_i16_add_var_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 %const) nounwind { ; SI-LABEL: v_ctpop_i16_add_var_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s12, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s8 -; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_var_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: 
s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_var_inv: @@ -1400,45 +1400,45 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in, i16 addrspace(1)* noalias %constptr) nounwind { ; SI-LABEL: v_ctpop_i16_add_vvar_inv: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[2:3], s[14:15] +; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctpop_i16_add_vvar_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; 
VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_vvar_inv: diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -91,28 +91,28 @@ define amdgpu_kernel void @v_cttz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 
s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -148,9 +148,8 @@ ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -162,9 +161,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -184,30 +182,30 @@ define amdgpu_kernel void @v_cttz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; 
SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v1, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v1, 32, v1 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -248,9 +246,8 @@ ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -264,9 +261,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -288,16 +284,18 @@ define amdgpu_kernel void @v_cttz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) 
nounwind { ; SI-LABEL: v_cttz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v3, v3 ; SI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -307,15 +305,13 @@ ; SI-NEXT: v_min_u32_e32 v2, 32, v2 ; SI-NEXT: v_min_u32_e32 v1, 32, v1 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -366,9 +362,8 @@ ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] @@ -386,9 +381,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] @@ -414,36 +408,40 @@ define amdgpu_kernel void @v_cttz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 0x100, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; VI-NEXT: 
s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x100, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i8: @@ -477,9 +475,8 @@ ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -490,9 +487,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -593,32 +589,34 @@ define amdgpu_kernel void @s_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s5, s5 -; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf -; SI-NEXT: s_add_i32 s5, s5, 32 -; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_ff1_i32_b32 s0, s3 +; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf +; SI-NEXT: s_add_i32 s0, s0, 32 
+; SI-NEXT: s_ff1_i32_b32 s1, s2 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_min3_u32 v0, s1, v0, 64 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s5, s5 -; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp -; VI-NEXT: s_ff1_i32_b32 s4, s4 -; VI-NEXT: v_min3_u32 v0, s4, v0, 64 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_ff1_i32_b32 s0, s3 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp +; VI-NEXT: s_ff1_i32_b32 s0, s2 +; VI-NEXT: v_min3_u32 v0, s0, v0, 64 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_i64_trunc: @@ -640,29 +638,25 @@ ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s0, s3 -; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp -; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 -; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_ff1_i32_b32 s3, s3 +; GFX10-NEXT: s_ff1_i32_b32 s2, s2 +; GFX10-NEXT: v_add_nc_u32_e64 v0, s3, 32 clamp +; GFX10-NEXT: v_min3_u32 v0, s2, v0, 64 +; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_clause 0x1 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-GISEL-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -673,14 +667,15 @@ define amdgpu_kernel void @v_cttz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_cttz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v3, v3 ; SI-NEXT: v_min_u32_e32 v3, 0xffffffdf, v3 @@ -688,15 +683,13 @@ ; SI-NEXT: v_ffbl_b32_e32 v2, v2 ; SI-NEXT: v_min3_u32 v2, v2, v3, 64 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v1, s3 @@ -743,9 +736,8 @@ ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -759,9 +751,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -785,30 +776,29 @@ define amdgpu_kernel void @v_cttz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_cttz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v4 ; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; SI-NEXT: v_ffbl_b32_e32 v3, v3 ; SI-NEXT: v_min3_u32 v0, v3, v0, 64 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s3 @@ -855,9 +845,8 @@ ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] @@ -871,9 +860,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] @@ -898,27 +886,27 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt 
vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -955,9 +943,8 @@ ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -968,9 +955,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -994,27 +980,27 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 
addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1051,9 +1037,8 @@ ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1064,9 +1049,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1091,30 +1075,30 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1156,9 +1140,8 @@ ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1172,9 +1155,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1198,30 +1180,30 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1263,9 +1245,8 @@ ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1279,9 +1260,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -1305,25 +1285,25 @@ define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1365,9 +1345,8 @@ ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; 
%bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1377,9 +1356,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 @@ -1407,39 +1385,43 @@ define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: 
s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s8, s2 +; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i16_sel_eq_neg1: @@ -1471,9 +1453,8 @@ ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1487,9 +1468,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1512,26 +1492,26 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1574,9 +1554,8 @@ ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1587,9 +1566,8 @@ ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 diff --git 
a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -67,27 +67,27 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -121,9 +121,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] @@ -142,28 +141,28 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v1, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -199,9 +198,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] @@ -221,30 +219,30 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_cttz_zero_undef_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v3, v3 ; SI-NEXT: v_ffbl_b32_e32 v2, v2 ; SI-NEXT: v_ffbl_b32_e32 v1, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -284,9 +282,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 
v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] @@ -503,24 +500,24 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind { ; SI-LABEL: s_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s5, s5 -; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: s_add_i32 s5, s5, 32 -; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_ff1_i32_b32 s3, s3 +; SI-NEXT: s_ff1_i32_b32 s2, s2 +; SI-NEXT: s_add_i32 s3, s3, 32 +; SI-NEXT: s_min_u32 s2, s2, s3 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_ff1_i32_b32 s3, s3 @@ -551,15 +548,14 @@ ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[2:3] -; GFX9-GISEL-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x200000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX9-GISEL-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x200000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -571,26 +567,27 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i8_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -635,9 +632,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; 
GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -658,15 +654,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i16_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -674,14 +673,12 @@ ; SI-NEXT: v_ffbl_b32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: 
s_addc_u32 s5, s3, 0 @@ -734,9 +731,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -759,17 +755,20 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i32_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -782,13 +781,12 @@ ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -848,9 +846,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -879,21 +876,24 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_zero_undef_i64_with_select: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v4, off, s[4:7], 0 offset:4 -; SI-NEXT: buffer_load_ubyte v5, off, s[4:7], 0 offset:5 -; SI-NEXT: buffer_load_ubyte v6, off, s[4:7], 0 offset:6 -; SI-NEXT: buffer_load_ubyte v7, off, s[4:7], 0 offset:7 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: 
s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:4 +; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:5 +; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:6 +; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:7 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: s_waitcnt vmcnt(4) @@ -916,13 +916,12 @@ ; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 ; SI-NEXT: v_min3_u32 v0, v0, v1, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 5 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1025,9 +1024,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 @@ -1070,17 +1068,20 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1092,13 +1093,12 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1159,9 +1159,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -1191,17 +1190,20 @@ define amdgpu_kernel void 
@v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1213,13 +1215,12 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1280,9 +1281,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -1312,17 +1312,20 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 -; SI-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:2 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1337,13 +1340,12 @@ ; SI-NEXT: v_min_u32_e32 v0, 32, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 3 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1409,9 +1411,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 @@ -1441,24 +1442,25 @@ define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1503,9 +1505,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: 
; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] @@ -1528,28 +1529,29 @@ define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; SI-LABEL: v_cttz_i16_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 1 ; VI-NEXT: s_addc_u32 s5, s3, 0 @@ -1603,9 +1605,8 @@ ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; 
GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -948,25 +948,25 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -981,9 +981,8 @@ ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -993,22 +992,18 @@ ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -1028,28 +1023,28 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: 
buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1065,9 +1060,8 @@ ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] @@ -1079,24 +1073,21 @@ ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] @@ -1117,30 +1108,30 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1157,9 +1148,8 @@ ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1172,25 +1162,22 @@ ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1211,30 +1198,30 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1252,9 +1239,8 @@ ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -1268,26 +1254,23 @@ ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] @@ -1314,19 +1297,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -1335,15 +1320,13 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1373,9 +1356,8 @@ ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -1396,15 +1378,14 @@ ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1413,15 +1394,13 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -1460,10 +1439,13 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 @@ -1473,7 +1455,6 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 @@ -1494,18 +1475,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 @@ -1515,8 +1498,7 @@ ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; VI-NEXT: v_add_u16_e32 v8, 9, v4 ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1525,7 +1507,7 @@ ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1535,10 +1517,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] -; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1557,8 +1537,8 @@ ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] -; GFX10-NEXT: global_store_dword v4, v5, s[4:5] +; GFX10-NEXT: 
global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dword v4, v5, s[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: @@ -1569,8 +1549,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_movk_i32 s4, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v4 @@ -1599,9 +1578,7 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1631,8 +1608,8 @@ ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX11-NEXT: global_store_b32 v4, v5, s[0:1] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b32 v4, v5, s[2:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -1649,22 +1626,24 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) 
-; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:6 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[8:11], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[8:11], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v9, v[0:1], s[8:11], 0 addr64 offset:6 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -1679,17 +1658,15 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], 0 offset:24 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v6, off, s[4:7], 0 offset:24 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 
3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1733,9 +1710,8 @@ ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 @@ -1764,17 +1740,16 @@ ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[0:1] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[0:1] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1788,16 +1763,14 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 @@ -1836,16 +1809,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -1855,16 +1830,14 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1887,9 +1860,8 @@ ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] @@ -1908,12 +1880,11 @@ ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -1923,16 +1894,14 @@ ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 ; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] @@ -1961,28 +1930,28 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; 
SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1998,9 +1967,8 @@ ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2012,24 +1980,21 @@ ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2052,27 +2017,27 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2087,9 +2052,8 @@ ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2100,23 +2064,20 @@ ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2139,25 +2100,25 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 
-; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2172,9 +2133,8 @@ ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2184,22 +2144,18 @@ ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; 
GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] @@ -2220,19 +2176,21 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v2 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -2241,15 +2199,13 @@ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], 
off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2279,9 +2235,8 @@ ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -2302,15 +2257,14 @@ ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[0:1] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2319,15 +2273,13 @@ ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 @@ -2357,27 +2309,27 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2392,9 +2344,8 @@ ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2405,23 +2356,20 @@ ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2441,27 +2389,27 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: 
s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2476,9 +2424,8 @@ ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2489,23 +2436,20 @@ ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: 
s_waitcnt vmcnt(0) @@ -2526,27 +2470,27 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2561,9 +2505,8 @@ ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2574,23 +2517,20 @@ ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2611,27 +2551,27 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -2646,9 +2586,8 @@ ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] @@ -2659,23 +2598,20 @@ ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v1, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2750,8 +2686,7 @@ ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll --- a/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll +++ b/llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll @@ -1,8 +1,11 @@ ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}name:{{[ ]*}}vector_clause -; GCN: LOAD_DWORDX2 -; GCN-NEXT: LOAD_DWORDX2 +; GCN: S_LOAD_DWORDX4 +; GCN: GLOBAL_LOAD_DWORDX4_SADDR +; GCN: GLOBAL_LOAD_DWORDX4_SADDR +; GCN: GLOBAL_LOAD_DWORDX4_SADDR +; GCN: GLOBAL_LOAD_DWORDX4_SADDR ; GCN-NEXT: KILL define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { bb: @@ -23,8 +26,8 @@ %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14 %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16 %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14 - store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16 store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16 store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16 store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll @@ -4,15 +4,16 @@ define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s4, s5 -; GCN-NEXT: s_sext_i32_i8 s4, s2 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_sext_i32_i8 s2, s2 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 @@ -24,15 +25,16 @@ define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i8_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s4, s4, s5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_add_i32 s0, s2, s3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload @@ -46,15 +48,16 @@ define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: uniform_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 +; 
GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s2, s4, s5 -; GCN-NEXT: s_sext_i32_i16 s4, s2 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_add_i32 s2, s2, s3 +; GCN-NEXT: s_sext_i32_i16 s2, s2 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 @@ -66,15 +69,16 @@ define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: divergent_sext_in_reg_i16_to_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_add_i32 s4, s4, s5 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_add_i32 s0, s2, s3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %c = add i32 %a, %b ; add to prevent folding into extload diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -301,9 +301,8 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_f32: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 @@ -320,9 +319,8 @@ ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 @@ -354,9 +352,8 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { ; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s2 @@ -373,9 +370,8 @@ ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -58,8 +58,7 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3f16: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 ; GCN: buffer_store_short ; GCN: buffer_store_short @@ -75,11 +74,9 @@ ; FIXME: Why sometimes vector shift? 
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16: ; SI: s_load_dword s -; SI: s_load_dwordx2 s -; SI: s_load_dwordx2 s +; SI: s_load_dwordx4 s -; GFX89: s_load_dwordx2 s -; GFX89: s_load_dwordx2 s +; GFX89: s_load_dwordx4 s ; GFX89: s_load_dword s diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -61,8 +61,7 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v3i16: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 +; GCN: s_load_dwordx4 ; GCN-NOT: {{buffer|flat|global}}_load @@ -78,14 +77,14 @@ } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; SI: s_load_dwordx2 +; SI: s_load_dwordx4 ; SI: buffer_store_short ; SI: buffer_store_short -; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[0:1], 0x2c -; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[LOAD0]] +; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[0:1], 0x24 +; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]] ; GFX89-DAG: buffer_store_short [[VLOAD0]], off -; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[LOAD1]] +; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]] ; GFX89-DAG: buffer_store_short [[VLOAD1]], off define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -131,43 +131,40 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { ; CI-LABEL: s_fabs_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s1, s1, 0x7fff7fff -; CI-NEXT: 
s_and_b32 s0, s0, 0x7fff7fff -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fabs_v4f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s1, 0x7fff7fff -; VI-NEXT: s_and_b32 s0, s0, 0x7fff7fff -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_fabs_v4f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s1, 0x7fff7fff -; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff7fff -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff7fff +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, <4 x half> addrspace(1)* %out diff --git 
a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -45,8 +45,8 @@ ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_bitset0_b32 -; GCN: s_bitset0_b32 +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -70,11 +70,11 @@ } ; GCN-LABEL: {{^}}fabs_fn_fold: -; SI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9 +; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24 ; GCN-NOT: and -; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 @@ -83,11 +83,11 @@ } ; FUNC-LABEL: {{^}}fabs_fold: -; SI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0xb -; VI: s_load_dwordx2 s[[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]], s[{{[0-9]+:[0-9]+}}], 0x2c +; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x9 +; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[{{[0-9]+:[0-9]+}}], 0x24 ; GCN-NOT: and -; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]] -; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]] +; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[#LOAD + 3]] +; GCN: 
v_mul_f32_e64 v{{[0-9]+}}, |s[[#LOAD + 2]]|, [[V_MUL_VI]] define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -8,11 +8,11 @@ ; Try to identify arg based on higher address. ; FUNC-LABEL: {{^}}test_copysign_f32: -; SI: s_load_dwordx2 s[[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]], {{.*}} 0xb -; VI: s_load_dwordx2 s[[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]], {{.*}} 0x2c +; SI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], {{.*}} 0x9 +; VI: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], {{.*}} 0x24 -; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]] -; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]] +; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[#LOAD + 3]] +; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[#LOAD + 2]] ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]] ; GCN: buffer_store_dword [[RESULT]], diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -31,21 +31,21 @@ } ; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32: -; GCN-DAG: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} -; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] +; SI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] -; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] +; GCN-NONAN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] -; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]] +; VI-SAFE: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] -; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, s[[B]], [[VA]] +; SI-SAFE: 
v_min_legacy_f32_e32 {{v[0-9]+}}, s[[#LOAD + 3]], [[VA]] -; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[A]] -; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[A]], [[VB]] +; VI-SAFE: v_mov_b32_e32 [[VA:v[0-9]+]], s[[#LOAD + 2]] +; VI-SAFE: v_cmp_ngt_f32_e32 vcc, s[[#LOAD + 2]], [[VB]] ; VI-SAFE: v_cndmask_b32_e32 v{{[0-9]+}}, [[VB]], [[VA]] -; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]] +; GCN-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[#LOAD + 2]], [[VB]] define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b @@ -56,10 +56,10 @@ ; Nsz also needed ; FIXME: Should separate tests ; GCN-LABEL: {{^}}s_test_fmin_legacy_ule_f32_nnan_src: -; GCN: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} -; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[A]], 1.0 -; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[B]], 2.0 +; GCN-DAG: v_add_f32_e64 [[ADD_A:v[0-9]+]], s[[#LOAD + 2]], 1.0 +; GCN-DAG: v_add_f32_e64 [[ADD_B:v[0-9]+]], s[[#LOAD + 3]], 2.0 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[ADD_B]], [[ADD_A]] diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -162,11 +162,10 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: ds_pk_add_f16 
v0, v1 ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -187,11 +186,10 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -569,151 +569,131 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: 
raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s0, s6 +; GFX7-NEXT: s_mov_b32 s1, s7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 
; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1030-NEXT: global_store_dword v1, v0, s[6:7] ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_clause 0x2 -; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc +; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; G_SI-NEXT: 
s_mov_b32 s2, -1 ; G_SI-NEXT: s_mov_b32 s3, 0xf000 +; G_SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf +; G_GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX7-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX7-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; G_GFX7-NEXT: s_mov_b32 s2, -1 ; G_GFX7-NEXT: s_mov_b32 s3, 0xf000 +; G_GFX7-NEXT: s_mov_b64 s[0:1], s[6:7] ; G_GFX7-NEXT: s_waitcnt vmcnt(0) ; G_GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; G_GFX7-NEXT: s_endpgm ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_clause 0x1 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 -; G_GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; G_GFX10-NEXT: s_waitcnt vmcnt(0) +; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; 
G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc ; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) -; G_GFX1030-NEXT: global_store_dword v1, v0, s[0:1] +; G_GFX1030-NEXT: global_store_dword v1, v0, s[6:7] ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_clause 0x2 -; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 -; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; G_GFX1100-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c +; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) -; G_GFX1100-NEXT: global_store_b32 v1, v0, s[0:1] +; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm main_body: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -424,26 +424,21 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) { ; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; SI: ; %bb.0: ; %main_body -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; SI-NEXT: s_load_dword s8, s[0:1], 0xf -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0x10 +; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_mov_b32_e32 v2, s8 -; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; SI-NEXT: v_mov_b32_e32 v2, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: ds_write_b64 v2, v[0:1] ; SI-NEXT: s_endpgm ; ; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX7: ; %bb.0: ; %main_body -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xf -; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -457,26 +452,20 @@ ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v2, s8 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 
4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; GFX1030-NEXT: v_mov_b32_e32 v1, s5 @@ -489,26 +478,21 @@ ; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_SI: ; %bb.0: ; %main_body -; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; G_SI-NEXT: s_load_dword s8, s[0:1], 0xf -; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x10 +; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v1, s3 -; G_SI-NEXT: v_mov_b32_e32 v2, s8 -; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; G_SI-NEXT: v_mov_b32_e32 v2, s0 +; G_SI-NEXT: v_mov_b32_e32 v0, s4 +; G_SI-NEXT: v_mov_b32_e32 v1, s5 +; G_SI-NEXT: v_mov_b32_e32 v2, s6 +; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; G_SI-NEXT: v_mov_b32_e32 v2, s7 ; G_SI-NEXT: s_waitcnt vmcnt(0) ; G_SI-NEXT: ds_write_b64 v2, v[0:1] ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX7: ; %bb.0: ; %main_body -; G_GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xf -; G_GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; G_GFX7-NEXT: 
s_load_dwordx8 s[0:7], s[0:1], 0x9 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -522,26 +506,20 @@ ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_clause 0x2 -; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; G_GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c -; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s2 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s3 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s8 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s9 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s7 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_clause 0x2 -; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c -; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -87,26 +87,28 @@ define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { ; SI-LABEL: fp_to_sint_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: 
s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_i32_f32_e32 v1, s5 -; SI-NEXT: v_cvt_i32_f32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_i32_f32_e32 v1, s3 +; SI-NEXT: v_cvt_i32_f32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_i32_f32_e32 v1, s3 ; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v2i32: @@ -292,25 +294,26 @@ define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { ; SI-LABEL: fp_to_sint_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, 0x2f800000 -; SI-NEXT: s_mov_b32 s3, 0xcf800000 +; SI-NEXT: s_mov_b32 s8, 0x2f800000 +; SI-NEXT: s_mov_b32 s9, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v1, s0 -; SI-NEXT: v_mul_f32_e64 v2, |v0|, s2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_trunc_f32_e32 v0, s3 +; SI-NEXT: v_trunc_f32_e32 v1, s2 +; SI-NEXT: v_mul_f32_e64 v2, |v0|, s8 ; SI-NEXT: 
v_ashrrev_i32_e32 v3, 31, v0 -; SI-NEXT: v_mul_f32_e64 v4, |v1|, s2 +; SI-NEXT: v_mul_f32_e64 v4, |v1|, s8 ; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1 ; SI-NEXT: v_floor_f32_e32 v2, v2 ; SI-NEXT: v_floor_f32_e32 v4, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v6, v2 -; SI-NEXT: v_fma_f32 v0, v2, s3, |v0| +; SI-NEXT: v_fma_f32 v0, v2, s9, |v0| ; SI-NEXT: v_cvt_u32_f32_e32 v2, v4 -; SI-NEXT: v_fma_f32 v1, v4, s3, |v1| +; SI-NEXT: v_fma_f32 v1, v4, s9, |v1| ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_xor_b32_e32 v4, v6, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -326,23 +329,24 @@ ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0x2f800000 -; VI-NEXT: s_mov_b32 s7, 0xcf800000 -; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s8, 0x2f800000 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s5 -; VI-NEXT: v_mul_f32_e64 v1, |v0|, s6 +; VI-NEXT: v_trunc_f32_e32 v0, s3 +; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: v_fma_f32 v2, v1, s7, |v0| -; VI-NEXT: v_trunc_f32_e32 v4, s4 +; VI-NEXT: s_mov_b32 s0, 0xcf800000 +; VI-NEXT: v_fma_f32 v2, v1, s0, |v0| +; VI-NEXT: v_trunc_f32_e32 v4, s2 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 -; VI-NEXT: v_mul_f32_e64 v3, |v4|, s6 +; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 -; VI-NEXT: v_fma_f32 v3, v3, s7, |v4| +; VI-NEXT: v_fma_f32 v3, v3, s0, |v4| ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v3 ; VI-NEXT: v_xor_b32_e32 v2, v2, v0 @@ -353,9 +357,9 @@ ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s5, s1 
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -47,26 +47,28 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { ; SI-LABEL: fp_to_uint_v2f32_to_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_u32_f32_e32 v1, s5 -; SI-NEXT: v_cvt_u32_f32_e32 v0, s4 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cvt_u32_f32_e32 v1, s3 +; SI-NEXT: v_cvt_u32_f32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_u32_f32_e32 v1, s3 ; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_v2f32_to_v2i32: @@ -238,22 +240,23 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { ; SI-LABEL: 
fp_to_uint_v2f32_to_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s2, 0xcf800000 +; SI-NEXT: s_mov_b32 s8, 0xcf800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_trunc_f32_e32 v0, s1 -; SI-NEXT: v_trunc_f32_e32 v2, s0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_trunc_f32_e32 v0, s3 +; SI-NEXT: v_trunc_f32_e32 v2, s2 ; SI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_floor_f32_e32 v4, v1 ; SI-NEXT: v_floor_f32_e32 v5, v3 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v4 ; SI-NEXT: v_cvt_u32_f32_e32 v1, v5 -; SI-NEXT: v_fma_f32 v0, v4, s2, v0 -; SI-NEXT: v_fma_f32 v4, v5, s2, v2 +; SI-NEXT: v_fma_f32 v0, v4, s8, v0 +; SI-NEXT: v_fma_f32 v4, v5, s8, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v4 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -261,8 +264,9 @@ ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_trunc_f32_e32 v0, s3 ; VI-NEXT: v_trunc_f32_e32 v4, s2 @@ -277,9 +281,9 @@ ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_v2f32_to_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -13,49 +13,48 @@ define amdgpu_kernel void @fshl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshl_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_alignbit_b32 v0, s2, v0, 1 -; SI-NEXT: s_not_b32 s0, s0 -; SI-NEXT: s_lshr_b32 s1, s2, 1 -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: s_not_b32 s5, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; SI-NEXT: s_lshr_b32 s4, s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_not_b32 s4, s4 -; VI-NEXT: s_lshr_b32 s3, s2, 1 -; VI-NEXT: v_alignbit_b32 v0, s2, v0, 1 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_not_b32 s0, s0 +; VI-NEXT: s_lshr_b32 s1, s6, 1 +; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; 
VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_lshr_b32 s0, s2, 1 -; GFX9-NEXT: s_not_b32 s1, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_not_b32 s1, s2 +; GFX9-NEXT: s_lshr_b32 s0, s6, 1 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] @@ -77,33 +76,31 @@ ; ; GFX10-LABEL: fshl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, 1 -; GFX10-NEXT: s_lshr_b32 s0, s2, 1 -; GFX10-NEXT: s_not_b32 s1, s6 +; GFX10-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX10-NEXT: s_lshr_b32 s0, s6, 1 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x34 ; GFX11-NEXT: 
v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 -; GFX11-NEXT: s_not_b32 s3, s4 +; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, 1 +; GFX11-NEXT: s_lshr_b32 s1, s6, 1 +; GFX11-NEXT: s_not_b32 s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, v0, s3 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s1, v0, s0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm entry: @@ -115,20 +112,20 @@ define amdgpu_kernel void @fshl_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; SI-LABEL: fshl_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 @@ -139,13 +136,12 @@ ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -162,20 +158,16 @@ ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -22,42 +22,41 @@ define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s6, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_alignbit_b32 v0, s6, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dword s6, s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -74,23 +73,21 @@ ; ; GFX10-LABEL: fshr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s6, s[0:1], 0x34 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-NEXT: v_alignbit_b32 v0, s2, s3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbit_b32 v0, s6, 
s7, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -107,20 +104,20 @@ define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) { ; SI-LABEL: fshr_i32_imm: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_alignbit_b32 v0, s4, v0, 7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_alignbit_b32 v0, s2, v0, 7 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 @@ -131,13 +128,12 @@ ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 -; GFX9-NEXT: global_store_dword v0, 
v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -154,20 +150,16 @@ ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -57,39 +57,21 @@ } define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { -; CI-LABEL: load_v3f16_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s4, s0, 4 -; CI-NEXT: s_addc_u32 s5, s1, 0 -; CI-NEXT: v_mov_b32_e32 v2, s4 -; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v3, s5 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v5, s2 -; CI-NEXT: flat_store_short v[2:3], v4 -; CI-NEXT: flat_store_dword v[0:1], v5 -; CI-NEXT: s_endpgm -; -; VI-LABEL: load_v3f16_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; 
VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 -; VI-NEXT: flat_store_short v[2:3], v4 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: s_endpgm +; GCN-LABEL: load_v3f16_arg: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_u32 s4, s0, 4 +; GCN-NEXT: s_addc_u32 s5, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s2 +; GCN-NEXT: flat_store_short v[2:3], v4 +; GCN-NEXT: flat_store_dword v[0:1], v5 +; GCN-NEXT: s_endpgm store <3 x half> %arg, <3 x half> addrspace(1)* %out ret void } @@ -97,29 +79,16 @@ ; FIXME: Why not one load? define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { -; CI-LABEL: load_v4f16_arg: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; CI-NEXT: s_endpgm -; -; VI-LABEL: load_v4f16_arg: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] -; VI-NEXT: s_endpgm +; GCN-LABEL: load_v4f16_arg: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: 
v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: s_endpgm store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void } @@ -248,29 +217,27 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v4, s3 -; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 ; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] ; VI-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x float> @@ -281,33 +248,31 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f32_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s1, 16 -; CI-NEXT: s_lshr_b32 s5, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f32_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s1, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x float> @@ -447,45 +412,43 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { ; CI-LABEL: extload_v3f16_to_v3f64_arg: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 -; CI-NEXT: s_lshr_b32 s4, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 
; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; CI-NEXT: s_add_u32 s0, s2, 16 +; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: s_addc_u32 s3, s1, 0 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_mov_b32_e32 v7, s1 -; CI-NEXT: v_mov_b32_e32 v6, s0 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: v_mov_b32_e32 v6, s2 ; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v3f16_to_v3f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s4, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v7, s1 -; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %ext = fpext <3 x half> %arg to <3 x double> @@ -496,54 +459,52 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { ; CI-LABEL: extload_v4f16_to_v4f64_arg: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s4, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: s_lshr_b32 s5, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 -; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; CI-NEXT: s_add_u32 s0, s2, 16 -; CI-NEXT: s_addc_u32 s1, s3, 0 +; CI-NEXT: s_lshr_b32 s4, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CI-NEXT: s_lshr_b32 s5, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s5 +; CI-NEXT: s_add_u32 s2, s0, 16 ; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; CI-NEXT: v_mov_b32_e32 v9, s1 -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; CI-NEXT: v_mov_b32_e32 v9, s3 +; CI-NEXT: v_mov_b32_e32 v8, s2 +; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; CI-NEXT: s_nop 0 -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CI-NEXT: s_endpgm ; ; VI-LABEL: extload_v4f16_to_v4f64_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s5, s1, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 -; VI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; VI-NEXT: s_lshr_b32 s4, s0, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], 
v5 -; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s3 +; VI-NEXT: v_cvt_f32_f16_e32 v2, s5 +; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 +; VI-NEXT: v_cvt_f32_f16_e32 v6, s4 +; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_mov_b32_e32 v8, s0 -; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; VI-NEXT: v_mov_b32_e32 v9, s3 +; VI-NEXT: v_mov_b32_e32 v8, s2 +; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; VI-NEXT: s_nop 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; VI-NEXT: s_endpgm %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out @@ -1897,22 +1858,20 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { ; CI-LABEL: fadd_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s2, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s0 -; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CI-NEXT: s_lshr_b32 s4, s2, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: s_lshr_b32 s2, s3, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s2 ; CI-NEXT: v_add_f32_e32 v0, v0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_add_f32_e32 v1, v2, v3 ; CI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v2, v0, v1 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -1920,19 +1879,18 @@ ; ; VI-LABEL: fadd_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s1, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: s_lshr_b32 s4, s3, 16 +; VI-NEXT: s_lshr_b32 s5, s2, 16 +; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: v_add_f16_e32 v0, s2, v0 ; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v0, s0, v0 ; VI-NEXT: v_or_b32_e32 v2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %add = fadd <2 x half> %a, %b @@ -2007,34 +1965,33 @@ define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { ; CI-LABEL: fadd_v8f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 -; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8 +; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s10, s0, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v8, s0 -; CI-NEXT: s_lshr_b32 s0, s5, 16 -; CI-NEXT: s_lshr_b32 s11, s1, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v0, s10 -; CI-NEXT: s_lshr_b32 s10, s2, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; CI-NEXT: s_lshr_b32 s0, s6, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v1, 
s11 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 -; CI-NEXT: s_lshr_b32 s10, s3, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v10, s0 -; CI-NEXT: s_lshr_b32 s0, s7, 16 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 -; CI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; CI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; CI-NEXT: v_cvt_f32_f16_e32 v13, s5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, s2 -; CI-NEXT: v_cvt_f32_f16_e32 v7, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v14, s7 -; CI-NEXT: v_cvt_f32_f16_e32 v15, s6 +; CI-NEXT: s_lshr_b32 s2, s8, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 +; CI-NEXT: s_lshr_b32 s2, s11, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s2 +; CI-NEXT: s_lshr_b32 s2, s12, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 +; CI-NEXT: s_lshr_b32 s2, s13, 16 +; CI-NEXT: s_lshr_b32 s3, s9, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v9, s2 +; CI-NEXT: s_lshr_b32 s2, s14, 16 +; CI-NEXT: s_lshr_b32 s4, s10, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; CI-NEXT: v_cvt_f32_f16_e32 v10, s2 +; CI-NEXT: s_lshr_b32 s2, s15, 16 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; CI-NEXT: v_cvt_f32_f16_e32 v5, s9 +; CI-NEXT: v_cvt_f32_f16_e32 v11, s2 +; CI-NEXT: v_cvt_f32_f16_e32 v12, s12 +; CI-NEXT: v_cvt_f32_f16_e32 v13, s13 +; CI-NEXT: v_cvt_f32_f16_e32 v6, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v7, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v14, s15 +; CI-NEXT: v_cvt_f32_f16_e32 v15, s14 ; CI-NEXT: v_add_f32_e32 v1, v1, v9 ; CI-NEXT: v_add_f32_e32 v0, v0, v8 ; CI-NEXT: v_add_f32_e32 v3, v3, v11 @@ -2057,53 +2014,52 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v5, v1 ; CI-NEXT: v_or_b32_e32 v0, v4, v0 -; CI-NEXT: v_mov_b32_e32 v4, s8 +; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_or_b32_e32 v3, v7, v3 ; CI-NEXT: v_or_b32_e32 v2, v6, v2 -; CI-NEXT: v_mov_b32_e32 v5, s9 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: fadd_v8f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 -; VI-NEXT: s_load_dwordx4 
s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s3, 16 -; VI-NEXT: s_lshr_b32 s7, s11, 16 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_lshr_b32 s2, s15, 16 +; VI-NEXT: s_lshr_b32 s3, s11, 16 +; VI-NEXT: v_mov_b32_e32 v0, s15 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v0, s11, v0 +; VI-NEXT: s_lshr_b32 s2, s14, 16 +; VI-NEXT: s_lshr_b32 s3, s10, 16 +; VI-NEXT: v_or_b32_e32 v3, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_f16_e32 v1, s11, v1 -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: s_lshr_b32 s6, s10, 16 -; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_mov_b32_e32 v1, s14 ; VI-NEXT: v_add_f16_e32 v1, s10, v1 -; VI-NEXT: s_lshr_b32 s2, s1, 16 +; VI-NEXT: s_lshr_b32 s2, s13, 16 ; VI-NEXT: s_lshr_b32 s3, s9, 16 ; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: v_add_f16_e32 v1, s9, v1 -; VI-NEXT: s_lshr_b32 s1, s0, 16 -; VI-NEXT: s_lshr_b32 s2, s8, 16 +; VI-NEXT: s_lshr_b32 s2, s12, 16 +; VI-NEXT: s_lshr_b32 s3, s8, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s1 -; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: 
v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_add_f16_e32 v4, s8, v4 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %add = fadd <8 x half> %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll --- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll @@ -11,18 +11,17 @@ ; GFX8V3-LABEL: addrspacecast: ; GFX8V3: ; %bb.0: ; GFX8V3-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V3-NEXT: s_load_dword s2, s[4:5], 0x44 -; GFX8V3-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V3-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V3-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V3-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V3-NEXT: s_cmp_lg_u32 s0, -1 -; GFX8V3-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V3-NEXT: v_mov_b32_e32 v0, s3 ; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8V3-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8V3-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V3-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8V3-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V3-NEXT: v_mov_b32_e32 v2, s2 ; GFX8V3-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8V3-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; GFX8V3-NEXT: v_mov_b32_e32 v2, s1 @@ -37,18 +36,17 @@ ; GFX8V4-LABEL: addrspacecast: ; GFX8V4: ; %bb.0: ; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX8V4-NEXT: s_load_dword s2, s[4:5], 0x44 -; GFX8V4-NEXT: s_load_dword s3, s[4:5], 0x40 +; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x40 ; GFX8V4-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V4-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V4-NEXT: s_cmp_lg_u32 
s0, -1 -; GFX8V4-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V4-NEXT: v_mov_b32_e32 v0, s3 ; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8V4-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GFX8V4-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1 ; GFX8V4-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX8V4-NEXT: v_mov_b32_e32 v2, s3 +; GFX8V4-NEXT: v_mov_b32_e32 v2, s2 ; GFX8V4-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8V4-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; GFX8V4-NEXT: v_mov_b32_e32 v2, s1 @@ -63,17 +61,16 @@ ; GFX8V5-LABEL: addrspacecast: ; GFX8V5: ; %bb.0: ; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX8V5-NEXT: s_load_dword s2, s[4:5], 0xc8 -; GFX8V5-NEXT: s_load_dword s3, s[4:5], 0xcc +; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0xc8 ; GFX8V5-NEXT: v_mov_b32_e32 v4, 1 ; GFX8V5-NEXT: s_waitcnt lgkmcnt(0) ; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1 ; GFX8V5-NEXT: v_mov_b32_e32 v0, s2 +; GFX8V5-NEXT: v_mov_b32_e32 v2, s0 ; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GFX8V5-NEXT: v_mov_b32_e32 v0, s0 ; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1 -; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8V5-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GFX8V5-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX8V5-NEXT: v_mov_b32_e32 v2, s3 ; GFX8V5-NEXT: s_cselect_b64 vcc, -1, 0 ; GFX8V5-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -86,8 +86,7 @@ ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, 1 ; GCN-NEXT: v_mov_b32_e32 v0, s3 @@ -290,22 +289,21 @@ define amdgpu_kernel void 
@half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GCN-NEXT: s_load_dword s7, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 ; GCN-NEXT: s_mov_b32 s6, 0x3c003c00 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s7 ; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -317,15 +315,13 @@ define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x30 -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GCN-NEXT: s_andn2_b32 s3, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0x3c003c00 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_lshl_b32 s3, s3, 4 +; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GCN-NEXT: s_andn2_b32 s2, s2, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00 +; GCN-NEXT: 
s_or_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -402,15 +398,13 @@ define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s2, s[0:1], 0x30 -; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s2, 4 -; GCN-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GCN-NEXT: s_andn2_b32 s3, s3, s2 -; GCN-NEXT: s_and_b32 s2, s2, 0x10001 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_lshl_b32 s3, s3, 4 +; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GCN-NEXT: s_andn2_b32 s2, s2, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0x10001 +; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -425,22 +419,21 @@ define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GCN-NEXT: s_load_dword s7, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 0xffff -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 ; GCN-NEXT: s_mov_b32 s6, 0x10001 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s7, s7, 4 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s7 ; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; 
GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -453,20 +446,19 @@ ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_mov_b64 s[4:5], 0xffff ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s6, s6, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 ; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 -; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -986,10 +978,9 @@ ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_mov_b32 s7, 0xe80000 ; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: s_addc_u32 s5, s5, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 4 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s3, s3, 3 ; GCN-NEXT: v_mov_b32_e32 v1, s2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -9,26 +9,28 @@ define amdgpu_kernel void 
@insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2f32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 @@ -38,26 +40,28 @@ define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { ; SI-LABEL: insertelement_v2f32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 
0x40a00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2f32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 @@ -67,26 +71,28 @@ define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_0: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2i32_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
-; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 999, i32 0 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 @@ -96,26 +102,28 @@ define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { ; SI-LABEL: insertelement_v2i32_1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: insertelement_v2i32_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: buffer_store_dwordx2 
v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 999, i32 1 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 @@ -397,42 +405,44 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; SI-NEXT: s_mov_b32 s3, 0x100f000 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s6, 1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_cmp_lg_u32 s8, 1 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: s_cmp_lg_u32 s6, 0 +; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_lg_u32 s6, 1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_cmp_lg_u32 s8, 1 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: 
v_mov_b32_e32 v1, s5 -; VI-NEXT: s_cmp_lg_u32 s6, 0 +; VI-NEXT: s_cmp_lg_u32 s8, 0 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 @@ -678,38 +688,40 @@ define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i32: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_load_dword s8, s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: s_cmp_lg_u32 s8, 1 -; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc -; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i32: ; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s8, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: 
s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_cmp_lg_u32 s8, 1 -; VI-NEXT: s_cselect_b32 s4, s7, 5 +; VI-NEXT: s_cselect_b32 s0, s3, 5 ; VI-NEXT: s_cmp_lg_u32 s8, 0 -; VI-NEXT: s_cselect_b32 s5, s6, 5 -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_cselect_b32 s1, s2, 5 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i32> %a, i32 5, i32 %b store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 @@ -943,36 +955,36 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x3 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0x2 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s5, s6, 4 -; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 -; SI-NEXT: s_andn2_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s5, 0x50005 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s3, 4 +; SI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_andn2_b32 s1, s2, s0 +; SI-NEXT: s_and_b32 s0, s0, 0x50005 +; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0xc -; VI-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x8 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s5, s6, 4 -; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 -; VI-NEXT: s_andn2_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s5, 0x50005 -; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s3, 4 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_andn2_b32 s1, s2, s0 +; VI-NEXT: s_and_b32 s0, s0, 0x50005 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 @@ -982,45 +994,47 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s6, s[4:5], 0x4 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 -; SI-NEXT: s_mov_b32 s3, 0x100f000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; SI-NEXT: s_load_dword s8, s[4:5], 0x4 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s8, s6, 4 -; SI-NEXT: s_mov_b64 s[6:7], 0xffff -; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_and_b32 s9, s7, 0x50005 -; SI-NEXT: s_and_b32 s8, s6, 0x50005 -; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshl_b32 s8, s8, 4 +; SI-NEXT: s_mov_b64 s[0:1], 0xffff +; SI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 +; SI-NEXT: s_and_b32 s9, s1, 0x50005 +; SI-NEXT: s_and_b32 s8, s0, 0x50005 +; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v3i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; VI-NEXT: s_load_dword s8, s[4:5], 0x10 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s8, s6, 4 -; VI-NEXT: s_mov_b64 s[6:7], 0xffff -; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshl_b32 s8, s8, 4 +; VI-NEXT: s_mov_b64 s[0:1], 0xffff +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: s_mov_b32 s9, s8 -; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] -; VI-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] -; VI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: 
s_endpgm %vecins = insertelement <3 x i16> %a, i16 5, i32 %b store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -748,37 +748,36 @@ define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { ; SI-LABEL: v2i32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2i32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v2i32_arg: @@ -812,37 +811,36 @@ define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { ; SI-LABEL: v2f32_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v2f32_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v2f32_arg: @@ -1015,45 +1013,44 @@ define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { ; SI-LABEL: v3i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 4 -; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: s_add_u32 s4, s0, 4 +; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v5, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v3i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v3i16_arg: @@ -1396,37 +1393,36 @@ define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { ; SI-LABEL: v4i16_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v4i16_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v4i16_arg: @@ -1696,47 +1692,44 @@ define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind { ; SI-LABEL: v5i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xc -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dword s4, s[0:1], 0x30 -; VI-NEXT: s_load_dword s5, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s2, 4 -; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s4, s0, 4 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_byte v[2:3], v4 -; VI-NEXT: 
v_mov_b32_e32 v2, s5 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v5i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v5i8_arg: @@ -2555,37 +2548,36 @@ define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { ; SI-LABEL: v8i8_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 
v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v8i8_arg: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: v8i8_arg: @@ -4694,53 +4686,52 @@ define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { ; SI-LABEL: i65_arg: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xd +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s6, s2, 1 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_and_b32 s8, s4, 1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s4, 1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 8 -; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s0, 8 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_store_byte v[2:3], v4 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_byte v[4:5], v6 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: i65_arg: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s4, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_byte v2, v3, s[2:3] offset:8 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_byte v2, v3, s[0:1] offset:8 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: i65_arg: @@ -5718,65 +5709,56 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; SI-LABEL: array_3xi32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[0:1], 0xc -; SI-NEXT: s_load_dword s5, s[0:1], 0x9 -; SI-NEXT: s_load_dword s6, s[0:1], 0xa -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: 
s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v0, s1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_endpgm ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[0:1], 0x24 -; VI-NEXT: s_load_dword s3, s[0:1], 0x30 -; VI-NEXT: s_load_dword s4, s[0:1], 0x28 -; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: flat_store_dword v[0:1], v0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: array_3xi32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-NEXT: s_load_dword s3, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: global_store_short v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[0:1], v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: global_store_dword v[0:1], v0, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -29,8 +29,7 @@ ; FUNC-LABEL: {{^}}i65_arg: ; HSA-VI: kernarg_segment_byte_size = 24 ; HSA-VI: kernarg_segment_alignment = 4 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { entry: store i65 %in, i65 addrspace(1)* %out, align 4 @@ -113,10 +112,7 @@ } ; GCN-LABEL: {{^}}array_3xi32: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { store volatile i16 %arg0, i16 addrspace(1)* undef store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef @@ -124,8 +120,7 @@ } ; GCN-LABEL: {{^}}array_3xi16: -; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-VI: s_load_dword s{{[0-9]+}}, 
s[4:5], 0x4 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { store volatile i8 %arg0, i8 addrspace(1)* undef store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef @@ -143,7 +138,7 @@ } ; GCN-LABEL: {{^}}v3i15_arg: -; GCN: s_load_dword [[DWORD:s[0-9]+]] +; GCN: s_load_dwordx4 [[DWORDX4:s\[[0-9]+:[0-9]+\]]] ; GCN: s_lshl_b64 ; GCN: s_and_b32 ; GCN: s_and_b32 @@ -179,8 +174,7 @@ ; GCN-LABEL: {{^}}byref_constant_i32_arg: ; GCN: kernarg_segment_byte_size = 16 -; GCN: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x8{{$}} -; GCN: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0xc{{$}} +; GCN: s_load_dwordx4 [[LOAD:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}} define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) { %in = load i32, i32 addrspace(4)* %in.byref store volatile i32 %in, i32 addrspace(1)* %out, align 4 @@ -202,10 +196,9 @@ ; GCN-LABEL: {{^}}byref_align_constant_i32_arg: ; GCN: kernarg_segment_byte_size = 264 -; GCN-DAG: s_load_dword [[IN:s[0-9]+]], s[4:5], 0x100{{$}} -; GCN-DAG: s_load_dword [[AFTER_OFFSET:s[0-9]+]], s[4:5], 0x104{{$}} -; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], [[IN]] -; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], [[AFTER_OFFSET]] +; GCN-DAG: s_load_dwordx2 s[[[IN:[0-9]+]]:[[AFTER_OFFSET:[0-9]+]]], s[4:5], 0x100{{$}} +; GCN-DAG: v_mov_b32_e32 [[V_IN:v[0-9]+]], s[[IN]] +; GCN-DAG: v_mov_b32_e32 [[V_AFTER_OFFSET:v[0-9]+]], s[[AFTER_OFFSET]] ; GCN: global_store_dword v{{[0-9]+}}, [[V_IN]], s ; GCN: global_store_dword v{{[0-9]+}}, [[V_AFTER_OFFSET]], s define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) { @@ -263,9 +256,7 @@ ; GCN-LABEL: {{^}}multi_byref_constant_i32_arg: ; GCN: kernarg_segment_byte_size = 20 -; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x8 -; GCN: s_load_dword 
{{s[0-9]+}}, s[4:5], 0xc -; GCN: s_load_dword {{s[0-9]+}}, s[4:5], 0x10 +; GCN: s_load_dwordx4 {{s\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) { %in0 = load i32, i32 addrspace(4)* %in0.byref %in1 = load i32, i32 addrspace(4)* %in1.byref diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -22,28 +22,27 @@ ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 4 +; SI-NEXT: s_lshl_b32 s3, s2, 4 ; SI-NEXT: s_lshl_b32 s2, s2, 3 ; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; SI-NEXT: s_add_i32 s1, s1, 64 -; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: s_add_i32 s2, s3, 64 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: ds_min_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_min_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; SI-NEXT: s_endpgm @@ -158,26 +157,25 @@ ; G_SI-LABEL: lds_ds_fmin: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 
s4, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 s6, -1 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 ; G_SI-NEXT: s_add_u32 s4, s4, s3 ; G_SI-NEXT: s_addc_u32 s5, s5, 0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 -; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: s_lshl_b32 s3, s2, 3 ; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s1, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: ds_min_f32 v2, v0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s1 +; G_SI-NEXT: s_waitcnt lgkmcnt(1) ; G_SI-NEXT: ds_min_rtn_f32 v0, v0, v1 ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) @@ -310,28 +308,27 @@ ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s7, 0xe8f000 ; SI-NEXT: s_add_u32 s4, s4, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s5, s5, 0 -; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s1, s2, 4 +; SI-NEXT: s_lshl_b32 s3, s2, 4 ; SI-NEXT: s_lshl_b32 s2, s2, 3 ; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 ; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; SI-NEXT: s_add_i32 s1, s1, 64 
-; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: s_add_i32 s2, s3, 64 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: ds_max_f32 v2, v0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_max_rtn_f32 v0, v0, v1 -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen ; SI-NEXT: s_endpgm @@ -446,26 +443,25 @@ ; G_SI-LABEL: lds_ds_fmax: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 s6, -1 ; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 ; G_SI-NEXT: s_add_u32 s4, s4, s3 ; G_SI-NEXT: s_addc_u32 s5, s5, 0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: s_add_i32 s2, s2, 4 -; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 -; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: s_lshl_b32 s3, s2, 3 ; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 -; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v1, s3 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f32 v1, v1, v0 -; G_SI-NEXT: s_lshl_b32 s1, s2, 4 -; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: ds_max_f32 v2, v0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: v_mov_b32_e32 v0, s1 +; G_SI-NEXT: s_waitcnt lgkmcnt(1) ; G_SI-NEXT: ds_max_rtn_f32 v0, v0, v1 ; G_SI-NEXT: v_mov_b32_e32 v1, s0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) @@ -598,36 +594,35 @@ ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s3 -; 
SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_load_dword s4, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s0, s2, 3 -; SI-NEXT: s_lshl_b32 s5, s2, 4 -; SI-NEXT: s_add_i32 s2, s0, 32 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_lshl_b32 s5, s4, 4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_mov_b32 s3, 0x40450000 +; SI-NEXT: s_add_i32 s4, s4, 32 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: s_add_i32 s0, s5, 64 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_add_i32 s2, s5, 64 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: ds_min_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] -; SI-NEXT: s_add_i32 s0, s3, 4 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_add_i32 s1, s0, 4 +; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen ; SI-NEXT: s_endpgm ; @@ -761,37 +756,35 @@ ; G_SI-LABEL: lds_ds_fmin_f64: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 s10, -1 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 ; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s2, 0 -; G_SI-NEXT: 
s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s4, s4, 4 +; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: s_load_dword s5, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 -; G_SI-NEXT: s_lshl_b32 s1, s4, 3 ; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s4, s4, 4 ; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: s_lshl_b32 s2, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s1, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s1 +; G_SI-NEXT: s_lshl_b32 s2, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s2 ; G_SI-NEXT: ds_min_f64 v4, v[0:1] -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s1 +; G_SI-NEXT: s_waitcnt lgkmcnt(1) ; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] ; G_SI-NEXT: v_mov_b32_e32 v2, s0 ; G_SI-NEXT: s_add_u32 s0, s0, 4 +; G_SI-NEXT: v_mov_b32_e32 v3, s0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_SI-NEXT: s_waitcnt expcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: lds_ds_fmin_f64: @@ -941,36 +934,35 @@ ; SI: ; %bb.0: ; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_mov_b32 s11, 0xe8f000 ; SI-NEXT: s_add_u32 s8, s8, s3 -; SI-NEXT: s_load_dword s2, s[0:1], 0xb -; SI-NEXT: s_load_dword s3, s[0:1], 0x9 -; SI-NEXT: s_load_dword s4, s[0:1], 0xa ; SI-NEXT: s_addc_u32 s9, s9, 0 -; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s0, s2, 3 -; SI-NEXT: s_lshl_b32 s5, s2, 4 -; SI-NEXT: 
s_add_i32 s2, s0, 32 -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 -; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: s_lshl_b32 s5, s4, 4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_mov_b32 s3, 0x40450000 +; SI-NEXT: s_add_i32 s4, s4, 32 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; SI-NEXT: s_add_i32 s0, s5, 64 -; SI-NEXT: v_mov_b32_e32 v4, s0 +; SI-NEXT: s_add_i32 s2, s5, 64 +; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: ds_max_f64 v4, v[0:1] -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] -; SI-NEXT: s_add_i32 s0, s3, 4 -; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_add_i32 s1, s0, 4 +; SI-NEXT: v_mov_b32_e32 v2, s1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen ; SI-NEXT: s_endpgm ; @@ -1104,37 +1096,35 @@ ; G_SI-LABEL: lds_ds_fmax_f64: ; G_SI: ; %bb.0: ; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb ; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_load_dword s4, s[0:1], 0xb +; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; G_SI-NEXT: s_mov_b32 s10, -1 ; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 ; G_SI-NEXT: s_add_u32 s8, s8, s3 -; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s2, 0 -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: s_add_i32 s4, s4, 4 +; G_SI-NEXT: s_addc_u32 s9, s9, 0 ; G_SI-NEXT: s_mov_b32 s3, 0x40450000 -; G_SI-NEXT: s_load_dword s5, s[0:1], 0xa -; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 -; G_SI-NEXT: s_lshl_b32 s1, s4, 3 ; G_SI-NEXT: v_mov_b32_e32 v0, s2 -; G_SI-NEXT: v_mov_b32_e32 v2, s1 +; 
G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s4, s4, 4 ; G_SI-NEXT: v_mov_b32_e32 v1, s3 +; G_SI-NEXT: s_lshl_b32 s2, s4, 3 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] -; G_SI-NEXT: s_lshl_b32 s1, s4, 4 -; G_SI-NEXT: v_mov_b32_e32 v4, s1 +; G_SI-NEXT: s_lshl_b32 s2, s4, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s2 ; G_SI-NEXT: ds_max_f64 v4, v[0:1] -; G_SI-NEXT: s_waitcnt lgkmcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s1 +; G_SI-NEXT: s_waitcnt lgkmcnt(1) ; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] ; G_SI-NEXT: v_mov_b32_e32 v2, s0 ; G_SI-NEXT: s_add_u32 s0, s0, 4 +; G_SI-NEXT: v_mov_b32_e32 v3, s0 ; G_SI-NEXT: s_waitcnt lgkmcnt(0) ; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen -; G_SI-NEXT: s_waitcnt expcnt(0) -; G_SI-NEXT: v_mov_b32_e32 v0, s0 -; G_SI-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; G_SI-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen ; G_SI-NEXT: s_endpgm ; ; G_GFX7-LABEL: lds_ds_fmax_f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -14,9 +14,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], 
v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll @@ -4,10 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_i16_i32: -; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] -; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] -; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] +; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] +; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll @@ -4,10 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pk_u16_u32: -; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] -; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] -; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] +; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] +; VI: 
v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll @@ -4,10 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32: -; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] -; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]] -; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] +; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] +; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll @@ -4,10 +4,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32: -; GCN-DAG: s_load_dwordx2 s[[[SX:[0-9]+]]:[[SY:[0-9]+]]], s[0:1], 0x{{b|2c}} -; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]] -; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 
s[[SX]], [[VY]] -; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x{{9|24}} +; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[#LOAD + 3]] +; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] +; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[#LOAD + 2]], [[VY]] define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y) %r = bitcast <2 x i16> %result to i32 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,20 +8,20 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { ; SI-LABEL: s_cvt_pkrtz_v2f16_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 @@ -32,31 +32,26 @@ ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -304,6 +304,8 @@ ; GCN: s_cselect_b64 s[[C1:\[[0-9]+:[0-9]+\]]], -1, 0 ; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]] ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 +; SI-NEXT: s_mov_b32 +; SI-NEXT: s_mov_b32 ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: v_mov_b32_e32 ; GCN: {{global|flat|buffer}}_store_dwordx2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,19 +163,18 @@ define amdgpu_kernel 
void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s4, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s4, s5, 0, s4 -; GFX1013-NEXT: v_add_co_u32 v4, s4, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s4, s7, 0, s4 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -183,16 +182,15 @@ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000 @@ 
-201,34 +199,33 @@ ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 -; GFX1030-NEXT: v_add_co_u32 v2, s4, s6, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[4:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 +; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 -; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, 
s2, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 @@ -236,7 +233,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[0:3] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -264,16 +261,15 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s4, s4, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s4, s5, 0, s4 -; GFX1013-NEXT: v_add_co_u32 v4, s4, s6, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s4, s7, 0, s4 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -281,55 +277,53 @@ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX1013-NEXT: image_bvh_intersect_ray 
v[0:3], v[0:7], s[4:7] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 -; GFX1030-NEXT: v_add_co_u32 v2, s4, s6, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s4, 
s4, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 -; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 +; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -14,9 +14,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -20,18 +20,17 @@ define amdgpu_kernel void @test_sched_group_barrier_simple_pipeline(<32 x i32> addrspace(1)* noalias %in, <32 x i32> addrspace(1)* noalias %out) { ; GCN-LABEL: test_sched_group_barrier_simple_pipeline: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -69,14 +68,14 @@ ; GCN-NEXT: v_mul_lo_u32 v25, v25, v25 ; GCN-NEXT: v_mul_lo_u32 v24, v24, v24 ; GCN-NEXT: ; sched_group_barrier 
mask(0x00000002) size(30) SyncID(0) -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -14,9 +14,9 @@ ; GCN-LABEL: {{^}}buffer_store_format_d16_xy: -; UNPACKED: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 
0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] ; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -17,9 +17,9 @@ } ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: -; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] ; PREGFX10-UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -14,9 +14,9 @@ } ; GCN-LABEL: {{^}}tbuffer_store_d16_xy: -; GCN: s_load_dword [[S_DATA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], [[S_DATA]], 0xffff{{$}} +; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 +; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 +; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} ; 
UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] ; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] ; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -5,26 +5,28 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_bfe_u32 v0, v0, s5, s5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_bfe_u32 v0, v0, s3, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_bfe_u32 v0, v0, s5, s5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_bfe_u32 v0, v0, s3, s3 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 
%bfe_u32, i32 addrspace(1)* %out, align 4 @@ -34,28 +36,30 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { ; SI-LABEL: bfe_u32_arg_arg_imm: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfe_u32 v0, s4, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s2, v1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_bfe_u32 v0, s4, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -65,28 +69,30 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { ; SI-LABEL: bfe_u32_arg_imm_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 
s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfe_u32 v0, s4, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_bfe_u32 v0, s4, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -96,30 +102,32 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { ; SI-LABEL: bfe_u32_imm_arg_arg: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_movk_i32 s6, 0x7b +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_movk_i32 s8, 0x7b ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_bfe_u32 v0, s6, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_bfe_u32 v0, s8, v0, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_movk_i32 s6, 0x7b -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s8, 0x7b +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_bfe_u32 v0, s6, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_bfe_u32 v0, s8, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 @@ -1622,28 +1630,30 @@ define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: v_lshr_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshr_b32 s2, s4, s5 -; SI-NEXT: s_and_b32 s4, s2, 7 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_lshr_b32 s2, s2, s3 +; SI-NEXT: s_and_b32 s2, s2, 7 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s4, s5 -; VI-NEXT: s_and_b32 s4, s4, 7 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshr_b32 s0, s2, s3 +; VI-NEXT: s_and_b32 s0, s0, 7 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b %d = and i32 %c, 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -6,7 +6,9 @@ ; GCN-LABEL: {{^}}dpp_test: ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8: s_nop 1 +; GFX8-OPT: s_mov +; GFX8-OPT: s_mov +; GFX8-NOOPT: s_nop 1 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}} define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0 @@ -17,7 +19,9 @@ ; GCN-LABEL: {{^}}dpp_test_bc: ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} -; GFX8: s_nop 1 +; GFX8-OPT: s_mov +; GFX8-OPT: s_mov +; GFX8-NOOPT: s_nop 1 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}} define amdgpu_kernel void @dpp_test_bc(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -52,11 +52,9 @@ } ; FUNC-LABEL: {{^}}local_size_xy: -; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 -; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 -; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 -; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c -; GCN: s_mul_i32 [[VAL:s[0-9]+]], [[X]], [[Y]] +; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x6 +; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x18 +; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[X]], s[[Y]] ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) { @@ -91,11 +89,9 @@ ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 1 -; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 -; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 -; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c -; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; GCN: s_mul_i32 [[VAL:s[0-9]+]], [[Y]], [[Z]] +; SI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x7 +; VI-NOHSA-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x1c +; GCN: s_mul_i32 [[VAL:s[0-9]+]], s[[#LOAD + 0]], s[[#LOAD + 1]] ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) { @@ -111,14 +107,12 @@ ; HSA: enable_sgpr_private_segment_buffer = 1 ; HSA: enable_sgpr_dispatch_ptr = 1 -; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6 -; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7 -; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8 -; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18 -; VI-NOHSA-DAG: 
s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c -; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20 -; GCN: s_mul_i32 [[M:s[0-9]+]], [[X]], [[Y]] -; GCN: s_add_i32 [[VAL:s[0-9]+]], [[M]], [[Z]] +; SI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x6 +; SI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x8 +; VI-NOHSA-DAG: s_load_dwordx2 s[[[X:[0-9]+]]:[[Y:[0-9]+]]], s[0:1], 0x18 +; VI-NOHSA-DAG: s_load_dword s[[Z:[0-9]+]], s[0:1], 0x20 +; GCN: s_mul_i32 [[M:s[0-9]+]], s[[X]], s[[Y]] +; GCN: s_add_i32 [[VAL:s[0-9]+]], [[M]], s[[Z]] ; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,19 +8,17 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_and_b32 s4, s2, 0xffff ; VI-NEXT: s_lshr_b32 s2, s2, 16 @@ -37,38 +35,35 @@ ; ; CI-LABEL: s_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s6, s4, 0xffff -; CI-NEXT: s_lshr_b32 s4, s4, 16 -; CI-NEXT: s_lshr_b32 s7, s5, 16 -; CI-NEXT: s_lshr_b32 s4, s4, s7 -; CI-NEXT: s_lshl_b32 s4, s4, 16 -; CI-NEXT: s_lshr_b32 s5, s6, s5 -; CI-NEXT: s_or_b32 s4, s5, s4 -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_and_b32 s0, s2, 0xffff +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_lshr_b32 s2, s3, 16 +; CI-NEXT: s_lshr_b32 s1, s1, s2 +; CI-NEXT: s_lshl_b32 s1, s1, 16 +; CI-NEXT: s_lshr_b32 s0, s0, s3 +; CI-NEXT: s_or_b32 s0, s0, s1 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[4:5] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -634,66 +634,66 @@ define amdgpu_kernel void @mad_i64_i32_uniform(i64 addrspace(1)* %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; CI-LABEL: mad_i64_i32_uniform: ; CI: ; 
%bb.0: -; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s7 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1] +; CI-NEXT: s_mov_b32 s0, s4 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; CI-NEXT: s_endpgm ; ; SI-LABEL: mad_i64_i32_uniform: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: v_mul_hi_u32 v1, s2, v0 -; SI-NEXT: s_mul_i32 s2, s2, s3 -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v2, s1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mul_hi_u32 v1, s6, v0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; GFX9-LABEL: mad_i64_i32_uniform: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_i32 s0, s2, s3 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: s_mul_i32 s0, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 ; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: mad_i64_i32_uniform: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_i32 s6, s2, s3 -; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 -; GFX11-NEXT: s_add_u32 s2, s6, s4 -; GFX11-NEXT: s_addc_u32 s3, s3, s5 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_mul_i32 s2, s6, s7 +; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7 +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -2274,8 +2274,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2288,9 +2287,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2303,9 +2300,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2318,22 +2313,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2342,8 +2335,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2352,44 +2344,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2403,8 +2389,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2419,9 +2404,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2438,9 +2421,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2457,23 +2438,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2484,8 +2463,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2496,11 +2474,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -2508,11 +2485,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -2520,12 +2496,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2535,12 +2509,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2557,8 +2529,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2572,9 +2543,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2589,9 +2558,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2606,23 +2573,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2632,8 +2597,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2643,11 +2607,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -2655,11 +2618,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -2667,12 +2629,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2681,12 +2641,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2702,8 +2660,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2719,9 +2676,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2740,9 +2695,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2761,15 +2714,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2777,8 +2729,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2790,8 +2741,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2803,11 +2753,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -2817,11 +2766,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -2831,12 +2779,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2848,12 +2794,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2872,8 +2816,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2889,9 +2832,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2910,9 +2851,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2931,15 +2870,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2947,8 +2885,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2960,8 +2897,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2973,11 +2909,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -2987,11 +2922,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3001,12 +2935,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3018,12 +2950,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3042,8 +2972,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3058,9 +2987,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3077,9 +3004,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3096,23 +3021,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] @@ -3123,8 +3046,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3135,11 +3057,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -3147,11 +3068,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -3159,12 +3079,10 @@ ; ; 
GFX11-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3174,12 +3092,10 @@ ; ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3196,8 +3112,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3212,9 +3127,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: 
s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3231,9 +3144,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3250,23 +3161,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3277,8 +3186,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3289,11 +3197,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -3301,11 +3208,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -3313,12 +3219,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3328,12 +3232,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3350,8 +3252,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3367,9 +3268,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3388,9 +3287,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3409,15 +3306,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3425,8 +3321,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3438,8 +3333,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3451,11 +3345,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3465,11 +3358,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3479,12 +3371,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3496,12 +3386,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3520,8 +3408,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( 
; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3537,9 +3424,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3558,9 +3443,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3579,15 +3462,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3595,8 +3477,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3608,8 +3489,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3621,11 +3501,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3635,11 +3514,10 @@ ; ; 
GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3649,12 +3527,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3666,12 +3542,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3690,8 +3564,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3707,9 +3580,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3728,9 +3599,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3749,15 +3618,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3765,8 +3633,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3778,8 +3645,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3791,11 +3657,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3805,11 +3670,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3819,12 +3683,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3836,12 +3698,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3860,8 +3720,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3877,9 +3736,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3898,9 +3755,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3919,15 +3774,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3935,8 +3789,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3948,8 +3801,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3961,11 +3813,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3975,11 +3826,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -3989,12 +3839,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4006,12 
+3854,10 @@ ; ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4030,8 +3876,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4047,9 +3892,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4068,9 +3911,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4089,15 +3930,14 @@ 
; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4105,8 +3945,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4118,8 +3957,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], 
s[2:3] op_sel:[0,1] @@ -4131,11 +3969,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4145,11 +3982,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4159,12 +3995,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4176,12 +4010,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4200,8 +4032,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4217,9 +4048,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4238,9 +4067,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4259,15 +4086,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4275,8 +4101,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4288,8 +4113,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4301,11 +4125,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4315,11 +4138,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4329,12 +4151,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; 
GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4346,12 +4166,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4370,8 +4188,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4387,9 +4204,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4408,9 +4223,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4429,15 +4242,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4445,8 +4257,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4458,8 +4269,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4471,11 +4281,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4485,11 +4294,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4499,12 +4307,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4516,12 +4322,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4540,8 +4344,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4557,9 +4360,7 @@ ; ; GFX10-WGP-LABEL: 
flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4578,9 +4379,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4599,15 +4398,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4615,8 +4413,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4628,8 +4425,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4641,11 +4437,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4655,11 +4450,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -4669,12 +4463,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4686,12 +4478,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4710,8 +4500,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4728,9 +4517,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4747,9 +4534,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4766,26 +4551,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4796,8 +4579,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4808,11 +4590,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4820,11 +4601,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4832,12 +4612,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4846,12 +4624,10 @@ ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4869,8 +4645,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4888,9 +4663,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4909,9 +4682,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4930,26 +4701,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4961,8 +4730,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4974,11 +4742,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -4987,11 +4754,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -5000,12 +4766,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: buffer_gl0_inv @@ -5016,12 +4780,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5041,8 +4803,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5060,9 +4821,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5081,9 +4840,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, 
s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5102,27 +4859,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5134,8 +4889,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5147,11 +4901,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5161,11 +4914,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5175,12 +4927,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5191,12 +4941,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5216,8 +4964,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5236,9 +4983,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5259,9 +5004,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5282,27 +5025,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5315,8 +5056,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5329,11 +5069,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5344,11 +5083,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5359,12 +5097,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5377,12 +5113,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5404,8 +5138,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: 
; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5424,9 +5157,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5447,9 +5178,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5470,27 +5199,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5503,8 +5230,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5517,11 +5243,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5532,11 +5257,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -5547,12 +5271,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5565,12 +5287,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5592,8 +5312,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5611,9 +5330,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5632,9 +5349,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5653,26 +5368,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5684,8 +5397,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5697,11 +5409,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -5710,11 +5421,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -5723,12 +5433,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 
glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5739,12 +5447,10 @@ ; ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5764,8 +5470,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5783,9 +5488,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5804,9 +5507,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5825,26 +5526,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5856,8 +5555,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5869,11 +5567,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -5882,11 +5579,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -5895,12 +5591,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; 
GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5911,12 +5605,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5936,8 +5628,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5956,9 +5647,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5979,9 +5668,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6002,27 +5689,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6035,8 +5720,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6049,11 +5733,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6064,11 +5747,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6079,12 +5761,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6097,12 +5777,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6124,8 +5802,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6144,9 +5821,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6167,9 +5842,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6190,27 +5863,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6223,8 +5894,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6237,11 +5907,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:16 sc0 @@ -6252,11 +5921,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6267,12 +5935,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6285,12 +5951,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6312,8 +5976,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6332,9 +5995,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6355,9 +6016,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6378,27 +6037,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: 
s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6411,8 +6068,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6425,11 +6081,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6440,11 +6095,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6455,12 +6109,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3] offset:16 glc @@ -6473,12 +6125,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6500,8 +6150,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6520,9 +6169,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6543,9 +6190,7 @@ ; ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, 
s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6566,27 +6211,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6599,8 +6242,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6613,11 +6255,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6628,11 +6269,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6643,12 +6283,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6661,12 +6299,10 @@ ; ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6688,8 +6324,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6708,9 +6343,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6731,9 +6364,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6754,27 +6385,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6787,8 +6416,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6801,11 +6429,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6816,11 +6443,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -6831,12 +6457,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6849,12 +6473,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6876,8 +6498,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6896,9 +6517,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6919,9 +6538,7 @@ ; ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6942,27 +6559,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6975,8 +6590,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6989,11 +6603,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7004,11 +6617,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7019,12 +6631,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7037,12 +6647,10 @@ ; ; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7064,8 +6672,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7084,9 +6691,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -7107,9 +6712,7 @@ ; ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -7130,27 +6733,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7163,8 +6764,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7177,11 +6777,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7192,11 +6791,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7207,12 +6805,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7225,12 +6821,10 @@ ; ; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7252,8 +6846,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7272,9 +6865,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -7295,9 +6886,7 @@ ; ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -7318,27 +6907,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7351,8 +6938,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7365,11 +6951,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7380,11 +6965,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -7395,12 +6979,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7413,12 +6995,10 @@ ; ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -9726,8 +9306,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9740,9 +9319,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9755,9 +9332,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9770,22 +9345,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9794,8 +9367,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9804,44 +9376,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; 
GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9855,8 +9421,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9871,9 +9436,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9889,9 +9452,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9907,23 +9468,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9934,8 +9493,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9946,11 +9504,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -9958,11 +9515,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -9970,12 +9526,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9984,12 +9538,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10005,8 +9557,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10020,9 +9571,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10037,9 +9586,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10054,23 +9601,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10080,8 +9625,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10091,11 +9635,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10103,11 +9646,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10115,12 +9657,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10129,12 +9669,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ 
-10150,8 +9688,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10167,9 +9704,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10187,9 +9722,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10207,15 +9740,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10223,8 +9755,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10236,8 +9767,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10249,11 +9779,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10263,11 +9792,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10277,12 +9805,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10293,12 +9819,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, 
s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10316,8 +9840,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10333,9 +9856,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10353,9 +9874,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10373,15 +9892,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10389,8 +9907,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10402,8 +9919,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10415,11 +9931,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10429,11 +9944,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10443,12 +9957,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10459,12 +9971,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; 
GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10482,8 +9992,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10498,9 +10007,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10516,9 +10023,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10534,23 +10039,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: 
; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10561,8 +10064,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10573,11 +10075,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -10585,11 +10086,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -10597,12 +10097,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10611,12 +10109,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10632,8 +10128,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10648,9 +10143,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10666,9 +10159,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10684,23 +10175,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10711,8 +10200,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10723,11 +10211,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -10735,11 +10222,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -10747,12 +10233,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10761,12 +10245,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10782,8 +10264,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10799,9 +10280,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: 
s_addc_u32 s1, s1, 0 @@ -10819,9 +10298,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10839,15 +10316,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10855,8 +10331,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ 
-10868,8 +10343,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10881,11 +10355,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10895,11 +10368,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -10909,12 +10381,10 @@ ; ; 
GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10925,12 +10395,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10948,8 +10416,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10965,9 +10432,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10985,9 +10450,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11005,15 +10468,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11021,8 +10483,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11034,8 +10495,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11047,11 +10507,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11061,11 +10520,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11075,12 +10533,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11091,12 +10547,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11114,8 +10568,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11131,9 +10584,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11151,9 +10602,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11171,15 +10620,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11187,8 +10635,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11200,8 +10647,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11213,11 +10659,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11227,11 +10672,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11241,12 +10685,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11257,12 +10699,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11280,8 +10720,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11297,9 +10736,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11317,9 +10754,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11337,15 +10772,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11353,8 +10787,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11366,8 +10799,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11379,11 +10811,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11393,11 +10824,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11407,12 +10837,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11423,12 +10851,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11446,8 +10872,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11463,9 +10888,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11483,9 +10906,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11503,15 +10924,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11519,8 +10939,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11532,8 +10951,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11545,11 +10963,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11559,11 +10976,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11573,12 +10989,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11589,12 +11003,10 @@ ; ; 
GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11612,8 +11024,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11629,9 +11040,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11649,9 +11058,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ 
-11669,15 +11076,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11685,8 +11091,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11698,8 +11103,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11711,11 +11115,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11725,11 +11128,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11739,12 +11141,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11755,12 +11155,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11778,8 +11176,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11795,9 +11192,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11815,9 +11210,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11835,15 +11228,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11851,8 +11243,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11864,8 +11255,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11877,11 +11267,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11891,11 +11280,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -11905,12 +11293,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11921,12 +11307,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11944,8 +11328,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11961,9 +11344,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11981,9 +11362,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -12001,15 +11380,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -12017,8 +11395,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12030,8 +11407,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12043,11 +11419,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -12057,11 +11432,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: 
buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 @@ -12071,12 +11445,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12087,12 +11459,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12110,8 +11480,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12128,9 +11497,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12147,9 +11514,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12166,26 +11531,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12196,8 +11559,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12208,11 +11570,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12220,11 +11581,10 @@ ; ; GFX940-TGSPLIT-LABEL: 
flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12232,12 +11592,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12246,12 +11604,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12269,8 +11625,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12289,9 +11644,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12311,9 +11664,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12333,27 +11684,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 
+; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12366,8 +11715,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12379,11 +11727,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -12393,11 +11740,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -12406,12 +11752,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12423,12 +11767,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12449,8 +11791,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12468,9 +11809,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12489,9 +11828,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12510,27 +11847,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12542,8 +11877,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12555,11 +11889,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12569,11 +11902,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12583,12 +11915,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12599,12 +11929,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12624,8 +11952,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12645,9 +11972,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12669,9 +11994,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12693,28 +12016,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12728,8 +12049,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12742,11 +12062,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12758,11 +12077,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12773,12 +12091,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12792,12 +12108,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12820,8 +12134,7 @@ define amdgpu_kernel void 
@flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12841,9 +12154,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12865,9 +12176,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12889,28 +12198,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12924,8 +12231,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12938,11 +12244,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12954,11 +12259,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -12969,12 +12273,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12988,12 +12290,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13016,8 +12316,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13036,9 +12335,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13058,9 +12355,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13080,27 +12375,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13113,8 +12406,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13126,11 +12418,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -13140,11 +12431,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -13153,12 +12443,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13170,12 +12458,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13196,8 +12482,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13216,9 +12501,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13238,9 +12521,7 @@ ; ; GFX10-CU-LABEL: 
flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13260,27 +12541,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13293,8 +12572,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13306,11 +12584,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 @@ -13320,11 +12597,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 @@ -13333,12 +12609,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13350,12 +12624,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13376,8 +12648,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; 
GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13397,9 +12668,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13421,9 +12690,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13445,28 +12712,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13480,8 +12745,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13494,11 +12758,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13510,11 +12773,10 @@ ; ; GFX940-TGSPLIT-LABEL: 
flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13525,12 +12787,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13544,12 +12804,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13572,8 +12830,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13593,9 +12850,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13617,9 +12872,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13641,28 +12894,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13676,8 +12927,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13690,11 +12940,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13706,11 +12955,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13721,12 +12969,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 glc @@ -13740,12 +12986,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13768,8 +13012,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13789,9 +13032,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13813,9 +13054,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13837,28 +13076,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13872,8 +13109,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13886,11 +13122,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13902,11 +13137,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -13917,12 +13151,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13936,12 +13168,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13964,8 +13194,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13985,9 +13214,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14009,9 +13236,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14033,28 +13258,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14068,8 +13291,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14082,11 +13304,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14098,11 +13319,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14113,12 +13333,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14132,12 +13350,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ 
-14160,8 +13376,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14181,9 +13396,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14205,9 +13418,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14229,28 +13440,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14264,8 +13473,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14278,11 +13486,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14294,11 +13501,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14309,12 +13515,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14328,12 +13532,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14356,8 +13558,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14377,9 +13578,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14401,9 +13600,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14425,28 +13622,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: 
; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14460,8 +13655,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14474,11 +13668,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14490,11 +13683,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14505,12 +13697,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14524,12 +13714,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14552,8 +13740,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14573,9 +13760,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; 
GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14597,9 +13782,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14621,28 +13804,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14656,8 +13837,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14670,11 +13850,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14686,11 +13865,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14701,12 +13879,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14720,12 +13896,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14748,8 +13922,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14769,9 +13942,7 @@ ; ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14793,9 +13964,7 @@ ; ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14817,28 +13986,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) 
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14852,8 +14019,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14866,11 +14032,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14882,11 +14047,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 @@ -14897,12 +14061,10 @@ ; ; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14916,12 +14078,10 @@ ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -2008,8 +2008,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2022,9 +2021,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2037,9 +2034,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2052,22 +2047,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; 
SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2076,8 +2069,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2086,44 +2078,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: 
; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
flat_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2137,8 +2123,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2151,9 +2136,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2166,9 +2149,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2181,22 +2162,20 
@@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2205,8 +2184,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2215,44 +2193,38 @@ ; ; 
GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2266,8 +2238,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2280,9 +2251,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2295,9 +2264,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: 
s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2310,22 +2277,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2334,8 +2299,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2344,44 +2308,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2395,8 +2353,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2409,9 +2366,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2424,9 +2379,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2439,22 +2392,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2463,8 +2414,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2473,44 +2423,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 
v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2524,8 +2468,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2538,9 +2481,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2553,9 +2494,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2568,22 +2507,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2592,8 +2529,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2602,44 +2538,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2653,8 +2583,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2667,9 +2596,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2682,9 +2609,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2697,22 +2622,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2721,8 +2644,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2731,44 +2653,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2782,8 +2698,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2796,9 +2711,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2811,9 +2724,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2826,22 +2737,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2850,8 +2759,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2860,44 +2768,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2911,8 +2813,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2925,9 +2826,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2940,9 +2839,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2955,22 +2852,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2979,8 +2874,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2989,44 +2883,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3040,8 +2928,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3054,9 +2941,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3069,9 +2954,7 @@ ; ; GFX10-CU-LABEL: 
flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3084,22 +2967,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3108,8 +2989,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3118,44 +2998,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3169,8 +3043,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3183,9 +3056,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 
0 @@ -3198,9 +3069,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3213,22 +3082,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3237,8 +3104,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3247,44 +3113,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3298,8 +3158,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3312,9 +3171,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, 
s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3327,9 +3184,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3342,22 +3197,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3366,8 +3219,7 @@ ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3376,44 +3228,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3427,8 +3273,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3441,9 +3286,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3456,9 +3299,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3471,22 +3312,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ 
-3495,8 +3334,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3505,44 +3343,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 
0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3556,8 +3388,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3570,9 +3401,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3585,9 +3414,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3600,22 +3427,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], 
s[2:3], s[2:3] op_sel:[0,1] @@ -3624,8 +3449,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3634,44 +3458,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3685,8 +3503,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3699,9 +3516,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3714,9 +3529,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3729,22 +3542,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3753,8 +3564,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3763,44 +3573,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3814,8 +3618,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3828,9 +3631,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3843,9 +3644,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3858,22 +3657,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3882,8 +3679,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3892,44 +3688,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; 
GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3943,8 +3733,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -3961,9 +3750,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -3980,9 +3767,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -3999,26 +3784,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4029,8 +3812,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4041,11 +3823,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4053,11 +3834,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4065,12 +3845,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4079,12 +3857,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4102,8 +3878,7 @@ define 
amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4120,9 +3895,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4139,9 +3912,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4158,26 +3929,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4188,8 +3957,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4200,11 +3968,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4212,11 +3979,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4224,12 +3990,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4238,12 +4002,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4261,8 +4023,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4279,9 +4040,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4298,9 +4057,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4317,26 +4074,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4347,8 +4102,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], 
s[2:3], s[2:3] op_sel:[0,1] @@ -4359,11 +4113,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4371,11 +4124,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4383,12 +4135,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 
v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4397,12 +4147,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4420,8 +4168,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4438,9 +4185,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4457,9 +4202,7 @@ ; ; 
GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4476,26 +4219,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4506,8 +4247,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4518,11 +4258,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4530,11 +4269,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4542,12 +4280,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4556,12 +4292,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4579,8 +4313,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4597,9 +4330,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4616,9 +4347,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4635,26 +4364,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4665,8 +4392,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4677,11 +4403,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4689,11 +4414,10 @@ ; ; GFX940-TGSPLIT-LABEL: 
flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4701,12 +4425,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4715,12 +4437,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
+; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4738,8 +4458,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4756,9 +4475,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4775,9 +4492,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4794,26 +4509,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: 
s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4824,8 +4537,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4836,11 +4548,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4848,11 +4559,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4860,12 +4570,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4874,12 +4582,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4897,8 +4603,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4915,9 +4620,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4934,9 +4637,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4953,26 +4654,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4983,8 +4682,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4995,11 +4693,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5007,11 +4704,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5019,12 +4715,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5033,12 +4727,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5056,8 +4748,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5074,9 +4765,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5093,9 +4782,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5112,26 +4799,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5142,8 +4827,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5154,11 +4838,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5166,11 +4849,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5178,12 +4860,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5192,12 +4872,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5215,8 +4893,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; 
GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5233,9 +4910,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5252,9 +4927,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5271,26 +4944,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5301,8 +4972,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5313,11 +4983,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5325,11 +4994,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5337,12 +5005,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5351,12 +5017,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5374,8 +5038,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5392,9 +5055,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5411,9 +5072,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5430,26 +5089,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5460,8 +5117,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5472,11 +5128,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5484,11 +5139,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5496,12 +5150,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5510,12 +5162,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5533,8 +5183,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5551,9 +5200,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5570,9 +5217,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: 
; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5589,26 +5234,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 
@@ -5619,8 +5262,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5631,11 +5273,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5643,11 +5284,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -5655,12 +5295,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5669,12 +5307,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5692,8 +5328,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 
0 @@ -5710,9 +5345,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5729,9 +5362,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5748,26 +5379,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword 
v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5778,8 +5407,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5790,11 +5418,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5802,11 +5429,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5814,12 +5440,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5828,12 +5452,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] 
offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5851,8 +5473,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5869,9 +5490,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5888,9 +5507,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5907,26 +5524,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5937,8 +5552,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5949,11 +5563,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5961,11 +5574,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5973,12 +5585,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5987,12 +5597,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; 
GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6010,8 +5618,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6028,9 +5635,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6047,9 +5652,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6066,26 +5669,24 @@ ; ; SKIP-CACHE-INV-LABEL: 
flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6096,8 +5697,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6108,11 +5708,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6120,11 +5719,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6132,12 +5730,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6146,12 +5742,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6169,8 +5763,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6187,9 +5780,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6206,9 +5797,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6225,26 +5814,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6255,8 +5842,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6267,11 +5853,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6279,11 +5864,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6291,12 +5875,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6305,12 +5887,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -8323,8 +7903,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8337,9 +7916,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8352,9 +7929,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8367,22 +7942,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8391,8 +7964,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8401,44 +7973,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8452,8 +8018,7 @@ define amdgpu_kernel void 
@flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8466,9 +8031,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8481,9 +8044,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8496,22 +8057,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8520,8 +8079,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8530,44 +8088,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8581,8 +8133,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8595,9 +8146,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8610,9 +8159,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8625,22 +8172,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8649,8 +8194,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8659,44 +8203,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8710,8 +8248,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8724,9 +8261,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8739,9 +8274,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8754,22 +8287,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8778,8 +8309,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8788,44 +8318,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8839,8 +8363,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8853,9 +8376,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8868,9 +8389,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8883,22 +8402,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8907,8 +8424,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8917,44 +8433,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8968,8 +8478,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8982,9 +8491,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8997,9 +8504,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9012,22 +8517,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry 
-; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9036,8 +8539,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9046,44 +8548,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 
-; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
flat_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9097,8 +8593,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9111,9 +8606,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9126,9 +8619,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, 
s1, 0 @@ -9141,22 +8632,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9165,8 +8654,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ 
-9175,44 +8663,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9226,8 +8708,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9240,9 +8721,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9255,9 +8734,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9270,22 +8747,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9294,8 +8769,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9304,44 +8778,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9355,8 +8823,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9369,9 +8836,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9384,9 +8849,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
-; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9399,22 +8862,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9423,8 +8884,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9433,44 +8893,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9484,8 +8938,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9498,9 +8951,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9513,9 +8964,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9528,22 +8977,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9552,8 +8999,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9562,44 +9008,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9613,8 +9053,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9627,9 +9066,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: 
s_addc_u32 s1, s1, 0 @@ -9642,9 +9079,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9657,22 +9092,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9681,8 +9114,7 @@ ; ; GFX90A-TGSPLIT-LABEL: 
flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9691,44 +9123,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9742,8 +9168,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9756,9 +9181,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9771,9 +9194,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9786,22 +9207,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9810,8 +9229,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9820,44 +9238,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; 
GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9871,8 +9283,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9885,9 +9296,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9900,9 +9309,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9915,22 +9322,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9939,8 +9344,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9949,44 +9353,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], 
v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10000,8 +9398,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10014,9 +9411,7 @@ ; ; GFX10-WGP-LABEL: 
flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10029,9 +9424,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10044,22 +9437,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10068,8 +9459,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10078,44 +9468,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10129,8 +9513,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10143,9 +9526,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10158,9 +9539,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10173,22 +9552,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10197,8 +9574,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10207,44 +9583,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10258,8 +9628,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10276,9 +9645,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10295,9 +9662,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10314,26 +9679,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10344,8 +9707,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10356,11 +9718,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10368,11 +9729,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10380,12 +9740,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10394,12 +9752,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10417,8 +9773,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10435,9 +9790,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10454,9 +9807,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10473,26 +9824,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10503,8 +9852,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] 
op_sel:[0,1] @@ -10515,11 +9863,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10527,11 +9874,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10539,12 +9885,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10553,12 +9897,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10576,8 +9918,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10594,9 +9935,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: 
s_addc_u32 s5, s1, 0 @@ -10613,9 +9952,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10632,26 +9969,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10662,8 +9997,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10674,11 +10008,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10686,11 +10019,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10698,12 +10030,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10712,12 +10042,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10735,8 +10063,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10753,9 +10080,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10772,9 +10097,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10791,26 +10114,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10821,8 +10142,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10833,11 +10153,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] 
offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10845,11 +10164,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10857,12 +10175,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10871,12 +10187,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10894,8 +10208,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10912,9 +10225,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10931,9 +10242,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10950,26 +10259,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10980,8 +10287,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10992,11 
+10298,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11004,11 +10309,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11016,12 +10320,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11030,12 +10332,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11053,8 +10353,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11071,9 +10370,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ 
-11090,9 +10387,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11109,26 +10404,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11139,8 +10432,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11151,11 +10443,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11163,11 +10454,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11175,12 +10465,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11189,12 +10477,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11212,8 +10498,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11230,9 +10515,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11249,9 +10532,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11268,26 +10549,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11298,8 +10577,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11310,11 +10588,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11322,11 +10599,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11334,12 +10610,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11348,12 +10622,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 
:: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11371,8 +10643,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11389,9 +10660,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11408,9 +10677,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11427,26 +10694,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11457,8 +10722,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11469,11 +10733,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11481,11 +10744,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11493,12 +10755,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11507,12 +10767,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11530,8 +10788,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11548,9 +10805,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11567,9 +10822,7 @@ ; ; GFX10-CU-LABEL: 
flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11586,26 +10839,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11616,8 +10867,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11628,11 +10878,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11640,11 +10889,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11652,12 +10900,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11666,12 +10912,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11689,8 +10933,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11707,9 +10950,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11726,9 +10967,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11745,26 +10984,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11775,8 +11012,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11787,11 +11023,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -11799,11 +11034,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11811,12 +11045,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11825,12 +11057,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11848,8 +11078,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11866,9 +11095,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11885,9 +11112,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11904,26 +11129,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11934,8 +11157,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11946,11 +11168,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11958,11 +11179,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11970,12 +11190,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11984,12 +11202,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12007,8 +11223,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12025,9 +11240,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12044,9 +11257,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12063,26 +11274,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12093,8 +11302,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12105,11 +11313,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12117,11 +11324,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12129,12 +11335,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12143,12 +11347,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12166,8 +11368,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12184,9 +11385,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12203,9 +11402,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12222,26 +11419,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12252,8 +11447,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12264,11 +11458,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12276,11 
+11469,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12288,12 +11480,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12302,12 +11492,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12325,8 +11513,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12343,9 +11530,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12362,9 +11547,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12381,26 +11564,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: 
s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12411,8 +11592,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12423,11 +11603,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12435,11 +11614,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12447,12 +11625,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12461,12 +11637,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12484,8 +11658,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12502,9 +11675,7 @@ ; ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12521,9 +11692,7 @@ ; ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12540,26 +11709,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12570,8 +11737,7 @@ ; ; 
GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12582,11 +11748,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12594,11 +11759,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -12606,12 +11770,10 @@ ; ; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12620,12 +11782,10 @@ ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -2304,8 +2304,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2318,9 +2317,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2333,9 +2330,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2348,22 +2343,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; 
SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2372,8 +2365,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2382,44 +2374,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2433,8 +2419,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: 
; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2449,9 +2434,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2468,9 +2451,7 @@ ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2487,23 +2468,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 
; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2515,8 +2494,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2528,11 +2506,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -2540,11 +2517,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; 
%bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -2552,12 +2528,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2567,12 +2541,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], 
v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2589,8 +2561,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2604,9 +2575,7 @@ ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2621,9 +2590,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2638,23 +2605,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: 
s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2665,8 +2630,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2677,11 +2641,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 
sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -2689,11 +2652,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -2701,12 +2663,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2715,12 +2675,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2736,8 +2694,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2753,9 +2710,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2774,9 +2729,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2795,15 +2748,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: 
s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2811,8 +2763,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2826,8 +2777,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2841,11 +2791,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -2855,11 +2804,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -2869,12 +2817,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2886,12 +2832,10 @@ ; ; GFX11-CU-LABEL: 
flat_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2910,8 +2854,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2927,9 +2870,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2948,9 +2889,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2969,15 +2908,14 @@ ; ; 
SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2985,8 +2923,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3000,8 +2937,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], 
s[2:3], s[2:3] op_sel:[0,1] @@ -3015,11 +2951,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3029,11 +2964,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3043,12 +2977,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3060,12 +2992,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3084,8 +3014,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3100,9 +3029,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3119,9 +3046,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3138,23 +3063,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3166,8 +3089,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3179,11 +3101,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -3191,11 +3112,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -3203,12 +3123,10 @@ ; ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3218,12 +3136,10 @@ ; ; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3240,8 +3156,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3256,9 +3171,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3275,9 +3188,7 @@ ; ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3294,23 +3205,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3322,8 +3231,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3335,11 +3243,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -3347,11 +3254,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -3359,12 +3265,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3374,12 +3278,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3396,8 +3298,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3413,9 +3314,7 @@ ; ; GFX10-WGP-LABEL: 
flat_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3434,9 +3333,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3455,15 +3352,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3471,8 +3367,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3486,8 +3381,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3501,11 +3395,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3515,11 +3408,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3529,12 +3421,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3546,12 +3436,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3570,8 +3458,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry 
-; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3587,9 +3474,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3608,9 +3493,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3629,15 +3512,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3645,8 +3527,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3660,8 +3541,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3675,11 +3555,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3689,11 +3568,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; 
GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3703,12 +3581,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3720,12 +3596,10 @@ ; ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3744,8 +3618,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3761,9 +3634,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3782,9 +3653,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3803,15 +3672,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 
+; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3819,8 +3687,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3834,8 +3701,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3849,11 +3715,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3863,11 +3728,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -3877,12 +3741,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3894,12 +3756,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3918,8 +3778,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3935,9 +3794,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3956,9 +3813,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3977,15 +3832,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3993,8 +3847,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4008,8 +3861,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4023,11 +3875,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4037,11 +3888,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4051,12 +3901,10 @@ ; ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 
@@ -4068,12 +3916,10 @@ ; ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4092,8 +3938,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4109,9 +3954,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4130,9 +3973,7 @@ ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ 
-4151,15 +3992,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4167,8 +4007,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4182,8 +4021,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4197,11 +4035,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4211,11 +4048,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4225,12 +4061,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, 
s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4242,12 +4076,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4266,8 +4098,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4283,9 +4114,7 @@ ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4304,9 +4133,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: 
s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4325,15 +4152,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4341,8 +4167,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4356,8 +4181,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4371,11 +4195,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4385,11 +4208,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4399,12 +4221,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4416,12 +4236,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4440,8 +4258,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4457,9 +4274,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4478,9 +4293,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4499,15 +4312,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4515,8 +4327,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4530,8 +4341,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4545,11 +4355,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4559,11 +4368,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4573,12 +4381,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4590,12 +4396,10 @@ ; ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4614,8 +4418,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 
16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4631,9 +4434,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -4652,9 +4453,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -4673,15 +4472,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4689,8 +4487,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4704,8 +4501,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4719,11 +4515,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4733,11 +4528,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -4747,12 +4541,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4764,12 +4556,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4788,8 +4578,7 @@ define amdgpu_kernel void 
@flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4806,9 +4595,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4825,9 +4612,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4844,26 +4629,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4874,8 +4657,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4886,11 +4668,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4898,11 +4679,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4910,12 +4690,10 @@ ; ; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4924,12 +4702,10 @@ ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4947,8 +4723,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4966,9 +4741,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4987,9 +4760,7 @@ ; ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5008,26 +4779,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 
+; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5040,8 +4809,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5054,11 +4822,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_system_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5067,11 +4834,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5080,12 +4846,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5096,12 +4860,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5121,8 +4883,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5140,9 +4901,7 @@ ; ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5161,9 +4920,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5182,27 +4939,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5215,8 +4970,7 @@ ; ; 
GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5229,11 +4983,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5243,11 +4996,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5257,12 
+5009,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5273,12 +5023,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5298,8 +5046,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5318,9 +5065,7 @@ ; ; GFX10-WGP-LABEL: 
flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5341,9 +5086,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5364,27 +5107,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword 
v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5399,8 +5140,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5415,11 +5155,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5430,11 +5169,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5445,12 +5183,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5463,12 +5199,10 @@ ; ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5490,8 +5224,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5510,9 +5243,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5533,9 +5264,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5556,27 +5285,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5591,8 +5318,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5607,11 +5333,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 
-; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5622,11 +5347,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -5637,12 +5361,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5655,12 +5377,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5682,8 +5402,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5701,9 +5420,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5722,9 +5439,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5743,26 +5458,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; 
SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5775,8 +5488,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 
; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5789,11 +5501,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5802,11 +5513,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5815,12 +5525,10 @@ ; ; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5831,12 +5539,10 @@ ; ; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5856,8 +5562,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5875,9 +5580,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5896,9 +5599,7 @@ ; ; GFX10-CU-LABEL: 
flat_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5917,26 +5618,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5949,8 +5648,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5963,11 +5661,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5976,11 +5673,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -5989,12 +5685,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6005,12 +5699,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6030,8 +5722,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6050,9 +5741,7 @@ ; ; 
GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6073,9 +5762,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6096,27 +5783,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6131,8 +5816,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6147,11 +5831,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6162,11 +5845,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6177,12 +5859,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6195,12 +5875,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6222,8 +5900,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6242,9 +5919,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6265,9 +5940,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6288,27 +5961,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6323,8 +5994,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6339,11 +6009,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6354,11 +6023,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6369,12 +6037,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6387,12 +6053,10 @@ ; ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 
0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6414,8 +6078,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6434,9 +6097,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6457,9 +6118,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6480,27 +6139,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; 
%entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6515,8 +6172,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6531,11 +6187,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6546,11 +6201,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6561,12 +6215,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6579,12 +6231,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6606,8 +6256,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6626,9 +6275,7 @@ ; ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ 
-6649,9 +6296,7 @@ ; ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6672,27 +6317,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6707,8 +6350,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6723,11 +6365,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6738,11 +6379,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; 
GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6753,12 +6393,10 @@ ; ; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6771,12 +6409,10 @@ ; ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6798,8 +6434,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6818,9 +6453,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6841,9 +6474,7 @@ ; ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6864,27 +6495,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6899,8 +6528,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6915,11 +6543,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6930,11 +6557,10 @@ ; ; 
GFX940-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -6945,12 +6571,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6963,12 +6587,10 @@ ; ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6990,8 +6612,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7010,9 +6631,7 @@ ; ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -7033,9 +6652,7 @@ ; ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -7056,27 +6673,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7091,8 +6706,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7107,11 +6721,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7122,11 +6735,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7137,12 +6749,10 @@ ; ; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3] offset:16 glc @@ -7155,12 +6765,10 @@ ; ; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7182,8 +6790,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7202,9 +6809,7 @@ ; ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -7225,9 +6830,7 @@ ; ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 
16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -7248,27 +6851,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7283,8 +6884,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7299,11 +6899,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7314,11 +6913,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7329,12 +6927,10 @@ ; ; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7347,12 +6943,10 @@ ; ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7374,8 +6968,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7394,9 +6987,7 @@ ; ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -7417,9 +7008,7 @@ ; ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -7440,27 +7029,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7475,8 +7062,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -7491,11 +7077,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7506,11 +7091,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -7521,12 +7105,10 @@ ; ; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7539,12 +7121,10 @@ ; ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -9882,8 +9462,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: 
; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9896,9 +9475,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9911,9 +9488,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9926,22 +9501,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9950,8 +9523,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9960,44 +9532,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10011,8 +9577,7 @@ define amdgpu_kernel void 
@flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10027,9 +9592,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10045,9 +9608,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10063,23 +9624,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10091,8 +9650,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10104,11 +9662,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
buffer_inv sc0 sc1 @@ -10116,11 +9673,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -10128,12 +9684,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10142,12 +9696,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10163,8 +9715,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10178,9 +9729,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10195,9 +9744,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10212,23 +9759,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10239,8 +9784,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10251,11 +9795,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10263,11 +9806,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10275,12 +9817,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10289,12 +9829,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10310,8 +9848,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10327,9 +9864,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10347,9 +9882,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10367,15 +9900,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10383,8 +9915,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10398,8 +9929,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10413,11 +9943,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10427,11 +9956,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10441,12 +9969,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10457,12 +9983,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10480,8 +10004,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10497,9 +10020,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10517,9 +10038,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10537,15 +10056,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -10553,8 +10071,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10568,8 +10085,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10583,11 +10099,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10597,11 +10112,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -10611,12 +10125,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; 
GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10627,12 +10139,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10650,8 +10160,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10666,9 +10175,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10684,9 +10191,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10702,23 +10207,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10730,8 +10233,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10743,11 +10245,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -10755,11 +10256,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -10767,12 +10267,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10781,12 +10279,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10802,8 +10298,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10818,9 
+10313,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10836,9 +10329,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10854,23 +10345,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10882,8 +10371,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10895,11 +10383,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -10907,11 +10394,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -10919,12 +10405,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10933,12 +10417,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10954,8 +10436,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: 
flat_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10971,9 +10452,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10991,9 +10470,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11011,15 +10488,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11027,8 +10503,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11042,8 +10517,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11057,11 +10531,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11071,11 
+10544,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11085,12 +10557,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11101,12 +10571,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11124,8 +10592,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11141,9 +10608,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11161,9 +10626,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11181,15 +10644,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11197,8 +10659,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11212,8 +10673,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11227,11 +10687,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11241,11 +10700,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11255,12 +10713,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11271,12 +10727,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11294,8 +10748,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11311,9 +10764,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11331,9 +10782,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11351,15 +10800,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11367,8 +10815,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11382,8 +10829,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11397,11 +10843,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: 
; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11411,11 +10856,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11425,12 +10869,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11441,12 +10883,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11464,8 +10904,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11481,9 +10920,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11501,9 +10938,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11521,15 +10956,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11537,8 +10971,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11552,8 +10985,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11567,11 +10999,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11581,11 +11012,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11595,12 +11025,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11611,12 +11039,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11634,8 +11060,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11651,9 +11076,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: 
s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11671,9 +11094,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11691,15 +11112,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11707,8 +11127,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11722,8 +11141,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11737,11 +11155,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11751,11 +11168,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] 
offset:16 sc1 @@ -11765,12 +11181,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11781,12 +11195,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11804,8 +11216,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11821,9 +11232,7 @@ ; ; GFX10-WGP-LABEL: 
flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -11841,9 +11250,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -11861,15 +11268,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -11877,8 +11283,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11892,8 +11297,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11907,11 +11311,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11921,11 +11324,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -11935,12 +11337,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11951,12 +11351,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11974,8 +11372,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: 
flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11991,9 +11388,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -12011,9 +11406,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -12031,15 +11424,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -12047,8 +11439,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12062,8 +11453,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12077,11 +11467,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -12091,11 
+11480,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -12105,12 +11493,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12121,12 +11507,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12144,8 +11528,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -12161,9 +11544,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -12181,9 +11562,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -12201,15 +11580,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -12217,8 +11595,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12232,8 +11609,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12247,11 +11623,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -12261,11 +11636,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1 @@ -12275,12 +11649,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12291,12 +11663,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12314,8 +11684,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12332,9 +11701,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12351,9 +11718,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12370,26 +11735,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12400,8 +11763,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12412,11 +11774,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12424,11 +11785,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12436,12 +11796,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12450,12 +11808,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12473,8 +11829,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12493,9 +11848,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, 
s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12515,9 +11868,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12537,27 +11888,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12571,8 +11920,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12585,11 +11933,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -12599,11 +11946,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -12612,12 +11958,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12629,12 +11973,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12655,8 +11997,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12674,9 +12015,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12695,9 +12034,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12716,27 +12053,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12749,8 +12084,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12763,11 +12097,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -12777,11 +12110,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -12791,12 +12123,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12807,12 +12137,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12832,8 +12160,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12853,9 +12180,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12877,9 +12202,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12901,28 +12224,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; 
SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12938,8 +12259,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12954,11 +12274,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -12970,11 +12289,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -12985,12 +12303,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13004,12 +12320,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13032,8 +12346,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13053,9 +12366,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13077,9 +12388,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13101,28 +12410,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13138,8 +12445,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13154,11 +12460,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13170,11 +12475,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13185,12 +12489,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13204,12 +12506,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13232,8 +12532,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13252,9 +12551,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13274,9 +12571,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13296,27 +12591,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13330,8 +12623,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13344,11 +12636,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -13358,11 +12649,10 @@ ; ; GFX940-TGSPLIT-LABEL: 
flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -13371,12 +12661,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13388,12 +12676,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13414,8 +12700,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13434,9 +12719,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13456,9 +12739,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13478,27 +12759,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13512,8 +12791,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13526,11 +12804,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -13540,11 +12817,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 @@ -13553,12 +12829,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ 
-13570,12 +12844,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13596,8 +12868,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13617,9 +12888,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13641,9 +12910,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 
; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13665,28 +12932,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13702,8 +12967,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13718,11 +12982,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13734,11 +12997,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13749,12 +13011,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13768,12 +13028,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13796,8 +13054,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13817,9 +13074,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13841,9 +13096,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13865,28 +13118,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13902,8 +13153,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13918,11 +13168,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13934,11 +13183,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -13949,12 +13197,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13968,12 +13214,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13996,8 +13240,7 @@ define amdgpu_kernel void 
@flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14017,9 +13260,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14041,9 +13282,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14065,28 +13304,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 
v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14102,8 +13339,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14118,11 +13354,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14134,11 +13369,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14149,12 +13383,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14168,12 +13400,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14196,8 +13426,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14217,9 +13446,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14241,9 +13468,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14265,28 +13490,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14302,8 +13525,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14318,11 +13540,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14334,11 +13555,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14349,12 +13569,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14368,12 +13586,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14396,8 +13612,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14417,9 +13632,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: 
s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14441,9 +13654,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14465,28 +13676,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14502,8 +13711,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14518,11 +13726,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14534,11 +13741,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14549,12 +13755,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14568,12 +13772,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14596,8 +13798,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14617,9 +13818,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14641,9 +13840,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14665,28 +13862,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14702,8 +13897,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14718,11 +13912,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 
sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14734,11 +13927,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14749,12 +13941,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14768,12 +13958,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14796,8 +13984,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14817,9 +14004,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -14841,9 +14026,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -14865,28 +14048,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14902,8 +14083,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -14918,11 +14098,10 @@ ; ; 
GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14934,11 +14113,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -14949,12 +14127,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14968,12 +14144,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14996,8 +14170,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -15017,9 +14190,7 @@ ; ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -15041,9 +14212,7 @@ ; ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; 
GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -15065,28 +14234,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -15102,8 +14269,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -15118,11 +14284,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -15134,11 +14299,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 sc1 @@ -15149,12 +14313,10 @@ ; ; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15168,12 +14330,10 @@ ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -2008,8 +2008,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2022,9 +2021,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2037,9 +2034,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2052,22 +2047,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2076,8 +2069,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2086,44 +2078,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2137,8 +2123,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2151,9 +2136,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2166,9 +2149,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2181,22 +2162,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2205,8 +2184,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2215,44 +2193,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2266,8 +2238,7 @@ 
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2280,9 +2251,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2295,9 +2264,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2310,22 +2277,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2334,8 +2299,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2344,44 +2308,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; 
%bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ 
-2395,8 +2353,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2409,9 +2366,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2424,9 +2379,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2439,22 +2392,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2463,8 +2414,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2473,44 +2423,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; 
GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-CU-NEXT: s_endpgm @@ -2524,8 +2468,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2538,9 +2481,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2553,9 +2494,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2568,22 +2507,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2592,8 +2529,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2602,44 +2538,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2653,8 +2583,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2667,9 +2596,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2682,9 +2609,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2697,22 +2622,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2721,8 +2644,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2731,44 +2653,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2782,8 +2698,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2796,9 +2711,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2811,9 +2724,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2826,22 +2737,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2850,8 +2759,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2860,44 +2768,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2911,8 +2813,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2925,9 +2826,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2940,9 +2839,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2955,22 +2852,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2979,8 +2874,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2989,44 +2883,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3040,8 +2928,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3054,9 +2941,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3069,9 +2954,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3084,22 +2967,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3108,8 +2989,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3118,44 +2998,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3169,8 +3043,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3183,9 +3056,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3198,9 +3069,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3213,22 +3082,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 
+; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3237,8 +3104,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3247,44 +3113,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3298,8 +3158,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3312,9 +3171,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3327,9 +3184,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3342,22 +3197,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3366,8 +3219,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3376,44 +3228,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3427,8 +3273,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3441,9 +3286,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3456,9 +3299,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3471,22 +3312,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3495,8 +3334,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3505,44 +3343,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3556,8 +3388,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3570,9 +3401,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3585,9 +3414,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3600,22 +3427,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3624,8 +3449,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3634,44 +3458,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3685,8 +3503,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3699,9 +3516,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3714,9 +3529,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3729,22 +3542,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3753,8 +3564,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3763,44 +3573,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3814,8 +3618,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3828,9 +3631,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3843,9 +3644,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3858,22 +3657,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 
s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3882,8 +3679,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3892,44 +3688,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3943,8 +3733,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -3961,9 +3750,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -3979,10 +3766,8 @@ ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: -; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -3999,26 +3784,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4029,8 +3812,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4041,11 +3823,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4053,11 +3834,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4065,12 +3845,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4079,12 +3857,10 @@ ; ; GFX11-CU-LABEL: 
flat_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4102,8 +3878,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4120,9 +3895,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4139,9 +3912,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 
0 @@ -4158,26 +3929,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4188,8 +3957,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4200,11 +3968,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4212,11 +3979,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4224,12 +3990,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4238,12 +4002,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4261,8 +4023,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4279,9 +4040,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4298,9 +4057,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4317,26 +4074,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4347,8 +4102,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4359,11 +4113,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4371,11 +4124,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4383,12 +4135,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4397,12 +4147,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4420,8 +4168,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4438,9 +4185,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4457,9 +4202,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4476,26 +4219,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4506,8 +4247,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4518,11 +4258,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -4530,11 +4269,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4542,12 +4280,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4556,12 +4292,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 
v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4579,8 +4313,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4597,9 +4330,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4616,9 +4347,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4635,26 +4364,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4665,8 +4392,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4677,11 +4403,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4689,11 +4414,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4701,12 +4425,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4715,12 +4437,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4738,8 +4458,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4756,9 +4475,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4775,9 +4492,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4794,26 +4509,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4824,8 +4537,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4836,11 +4548,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4848,11 +4559,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4860,12 +4570,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4874,12 +4582,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4897,8 +4603,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4915,9 +4620,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4934,9 +4637,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4953,26 +4654,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4983,8 +4682,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4995,11 +4693,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5007,11 +4704,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5019,12 +4715,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5033,12 +4727,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5056,8 +4748,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5074,9 +4765,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5093,9 +4782,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5112,26 +4799,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5142,8 +4827,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5154,11 +4838,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], 
v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5166,11 +4849,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5178,12 +4860,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5192,12 +4872,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5215,8 +4893,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5233,9 +4910,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5252,9 +4927,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5271,26 +4944,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5301,8 +4972,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5313,11 +4983,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5325,11 +4994,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5337,12 +5005,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5351,12 +5017,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5374,8 +5038,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5392,9 +5055,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5411,9 +5072,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5430,26 +5089,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5460,8 +5117,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5472,11 +5128,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5484,11 +5139,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5496,12 +5150,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5510,12 +5162,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5533,8 +5183,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5551,9 +5200,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5570,9 +5217,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5589,26 +5234,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5619,8 +5262,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5631,11 +5273,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5643,11 +5284,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5655,12 +5295,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5669,12 +5307,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5692,8 +5328,7 @@ define amdgpu_kernel void 
@flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5710,9 +5345,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5729,9 +5362,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5748,26 +5379,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5778,8 +5407,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5790,11 +5418,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5802,11 +5429,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5814,12 +5440,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5828,12 +5452,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5851,8 +5473,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5869,9 +5490,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5888,9 +5507,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5907,26 +5524,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5937,8 +5552,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5949,11 +5563,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5961,11 +5574,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -5973,12 +5585,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5987,12 +5597,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6010,8 +5618,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6028,9 +5635,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6047,9 +5652,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6066,26 +5669,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6096,8 +5697,7 @@ ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6108,11 +5708,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6120,11 +5719,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6132,12 +5730,10 @@ ; ; 
GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6146,12 +5742,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6169,8 +5763,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6187,9 +5780,7 @@ ; ; GFX10-WGP-LABEL: 
flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6206,9 +5797,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6225,26 +5814,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6255,8 +5842,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6267,11 +5853,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -6279,24 +5864,21 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: -; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP: ; %bb.0: ; %entry +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6305,12 +5887,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -8323,8 +7903,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8337,9 +7916,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8352,9 +7929,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8367,22 +7942,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8391,8 +7964,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8401,44 +7973,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8452,8 +8018,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8466,9 +8031,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8481,9 +8044,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8496,22 +8057,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8520,8 +8079,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8530,44 +8088,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8581,8 +8133,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8595,9 +8146,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8610,9 +8159,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8625,22 +8172,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: 
s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8649,8 +8194,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8659,44 +8203,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8710,8 +8248,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8724,9 +8261,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8739,9 +8274,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8754,22 +8287,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8778,8 +8309,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8788,44 +8318,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8839,8 +8363,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8853,9 +8376,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8868,9 +8389,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8883,22 +8402,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8907,8 +8424,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8917,44 +8433,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; 
%bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8968,8 +8478,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8982,9 +8491,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8997,9 +8504,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9012,22 +8517,20 @@ ; ; SKIP-CACHE-INV-LABEL: 
flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9036,8 +8539,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9046,44 +8548,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9097,8 +8593,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9111,9 +8606,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9126,9 +8619,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9141,22 +8632,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9165,8 +8654,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9175,44 +8663,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9226,8 +8708,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9240,9 +8721,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9255,9 +8734,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9270,22 +8747,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9294,8 +8769,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9304,44 +8778,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9355,8 +8823,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9369,9 +8836,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9384,9 +8849,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9399,22 +8862,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9423,8 +8884,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9433,44 +8893,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9484,8 +8938,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9498,9 +8951,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9513,9 +8964,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; 
%entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9528,22 +8977,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9552,8 +8999,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9562,44 +9008,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9613,8 +9053,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9627,9 +9066,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9642,9 
+9079,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9657,22 +9092,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9681,8 +9114,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; 
GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9691,44 +9123,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9742,8 +9168,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9756,9 +9181,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9771,9 +9194,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9786,22 +9207,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9810,8 +9229,7 
@@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9820,44 +9238,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 
0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9871,8 +9283,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9885,9 +9296,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9900,9 +9309,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9915,22 +9322,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9939,8 +9344,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9949,44 +9353,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; 
%bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10000,8 +9398,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10014,9 +9411,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10029,9 +9424,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10044,22 +9437,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10068,8 +9459,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10078,44 +9468,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; 
GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10129,8 +9513,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10143,9 +9526,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; 
%entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10158,9 +9539,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10173,22 +9552,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10197,8 +9574,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10207,44 +9583,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10258,8 +9628,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10276,9 +9645,7 
@@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10295,9 +9662,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10314,26 +9679,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10344,8 +9707,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10356,11 +9718,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10368,11 +9729,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; 
%entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10380,12 +9740,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10394,12 +9752,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 
s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10417,8 +9773,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10435,9 +9790,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10454,9 +9807,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10473,26 +9824,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10503,8 +9852,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10515,11 +9863,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10527,11 +9874,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10539,12 +9885,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: 
flat_store_b32 v[0:1], v2 @@ -10553,12 +9897,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10576,8 +9918,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10594,9 +9935,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10613,9 +9952,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10632,26 +9969,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10662,8 +9997,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10674,11 +10008,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10686,11 +10019,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10698,12 +10030,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: 
s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10712,12 +10042,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10735,8 +10063,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10753,9 +10080,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: 
s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10772,9 +10097,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10791,26 +10114,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10821,8 +10142,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10833,11 +10153,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10845,11 +10164,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10857,12 +10175,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10871,12 +10187,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 @@ -10894,8 +10208,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10912,9 +10225,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10931,9 +10242,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10950,26 +10259,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10980,8 +10287,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10992,11 +10298,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11004,11 +10309,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11016,12 +10320,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11030,12 +10332,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; 
%bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11053,8 +10353,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11071,9 +10370,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11090,9 +10387,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11109,26 +10404,24 @@ ; ; 
SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11139,8 +10432,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11151,11 +10443,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11163,11 +10454,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11175,12 +10465,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11189,12 +10477,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11212,8 +10498,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11230,9 +10515,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11249,9 +10532,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11268,26 +10549,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11298,8 +10577,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11310,11 +10588,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11322,11 +10599,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11334,12 +10610,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11348,12 +10622,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11371,8 +10643,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: 
flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11389,9 +10660,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11408,9 +10677,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11427,26 +10694,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, 
s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11457,8 +10722,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11469,11 +10733,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11481,11 +10744,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11493,12 +10755,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11507,12 +10767,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11530,8 +10788,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11548,9 +10805,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11567,9 +10822,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11586,26 +10839,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11616,8 +10867,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ 
-11628,11 +10878,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11640,11 +10889,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11652,12 +10900,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11666,12 +10912,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11689,8 +10933,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11707,9 +10950,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11726,9 +10967,7 @@ 
; ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11745,26 +10984,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11775,8 +11012,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11787,11 +11023,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11799,11 +11034,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11811,12 +11045,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11825,12 +11057,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11848,8 +11078,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; 
GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11866,9 +11095,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11885,9 +11112,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11904,26 +11129,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11934,8 +11157,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11946,11 +11168,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -11958,11 +11179,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11970,12 +11190,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11984,12 +11202,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 
:: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12007,8 +11223,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12025,9 +11240,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12044,9 +11257,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12063,26 +11274,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; 
SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12093,8 +11302,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12105,11 +11313,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12117,11 +11324,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12129,12 +11335,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 
; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12143,12 +11347,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12166,8 +11368,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12184,9 +11385,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12203,9 +11402,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12222,26 +11419,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12252,8 +11447,7 @@ ; ; GFX90A-TGSPLIT-LABEL: 
flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12264,11 +11458,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12276,11 +11469,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12288,12 
+11480,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12302,12 +11492,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12325,8 +11513,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12343,9 
+11530,7 @@ ; ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12362,9 +11547,7 @@ ; ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12381,26 +11564,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: 
flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12411,8 +11592,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12423,11 +11603,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12435,11 +11614,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12447,12 +11625,10 @@ ; ; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12461,12 +11637,10 @@ ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -2181,8 +2181,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2195,9 +2194,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2210,9 +2207,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2225,22 +2220,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2249,8 +2242,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2259,44 +2251,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2310,8 +2296,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2325,9 +2310,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2343,9 +2326,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2359,23 +2340,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2385,8 +2364,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2397,22 +2375,20 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -2420,12 +2396,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2434,12 +2408,10 @@ ; ; GFX11-CU-LABEL: 
flat_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2454,8 +2426,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2469,9 +2440,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2486,9 +2455,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2502,23 +2469,21 @@ ; 
; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2528,8 +2493,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2539,34 
+2503,30 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 
s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2575,12 +2535,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2595,8 +2553,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2611,9 +2568,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2631,9 +2586,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2648,15 +2601,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2664,8 +2616,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2676,8 +2627,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2689,11 +2639,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2701,11 +2650,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -2714,12 +2662,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2730,12 +2676,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2751,8 +2695,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2767,9 +2710,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2787,9 +2728,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2804,15 +2743,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2820,8 +2758,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2832,8 +2769,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2845,11 +2781,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -2857,11 +2792,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -2870,12 +2804,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2886,12 +2818,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2907,8 +2837,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2922,9 +2851,7 @@ ; ; 
GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -2940,9 +2867,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -2956,23 +2881,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2982,8 +2905,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -2994,22 +2916,20 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], 
s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -3017,12 +2937,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3031,12 +2949,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3051,8 +2967,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3066,9 +2981,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3084,9 +2997,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3100,23 +3011,21 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3126,8 +3035,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3138,22 +3046,20 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -3161,12 +3067,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3175,12 +3079,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3195,8 +3097,7 @@ define 
amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3211,9 +3112,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3231,9 +3130,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3248,15 +3145,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3264,8 +3160,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3276,8 +3171,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3289,11 +3183,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3301,11 +3194,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -3314,12 +3206,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3330,12 +3220,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3351,8 +3239,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3367,9 +3254,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3387,9 +3272,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3404,15 +3287,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3420,8 +3302,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3432,8 +3313,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3445,11 +3325,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3457,11 +3336,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -3470,12 +3348,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3486,12 +3362,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3507,8 +3381,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3523,9 +3396,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3543,9 +3414,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3560,15 +3429,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3576,8 +3444,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3588,8 +3455,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3601,11 +3467,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3613,11 +3478,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -3626,12 +3490,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3642,12 +3504,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3663,8 +3523,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3679,9 +3538,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -3699,9 +3556,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -3716,15 +3571,14 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3732,8 +3586,7 @@ ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3744,8 +3597,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3757,11 +3609,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) @@ -3769,11 +3620,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -3782,12 +3632,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3798,12 +3646,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3819,8 +3665,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -3837,9 +3682,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, 
s1, 0 @@ -3856,9 +3699,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -3875,26 +3716,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3905,8 +3744,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -3917,11 +3755,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3929,11 +3766,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: 
flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -3941,12 +3777,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -3955,12 +3789,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -3978,8 +3810,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -3997,9 +3828,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4017,9 +3846,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4037,27 +3864,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4068,8 +3893,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4081,11 +3905,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4093,11 +3916,10 @@ ; ; 
GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -4106,12 +3928,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4121,12 +3941,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4144,8 +3962,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4163,9 +3980,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4184,9 +3999,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4204,27 +4017,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4236,8 +4047,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4249,11 +4059,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4262,11 +4071,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -4275,12 +4083,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 glc @@ -4291,12 +4097,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4315,8 +4119,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4335,9 +4138,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4357,9 +4158,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4378,28 +4177,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4411,8 +4208,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4425,11 +4221,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4438,11 +4233,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -4452,12 +4246,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -4469,12 +4261,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4493,8 +4283,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4513,9 +4302,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4535,9 +4322,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4556,28 +4341,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; 
GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4589,8 +4372,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4603,11 +4385,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4616,11 +4397,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -4630,12 +4410,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -4647,12 +4425,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4671,8 +4447,7 @@ define 
amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4690,9 +4465,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4710,9 +4483,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4730,27 +4501,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4761,8 +4530,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4774,11 +4542,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4786,11 +4553,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -4799,12 +4565,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4814,12 +4578,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4837,8 +4599,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4856,9 +4617,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -4876,9 +4635,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -4896,27 +4653,25 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -4927,8 +4682,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ 
-4940,11 +4694,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -4952,11 +4705,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -4965,12 +4717,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4980,12 +4730,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5003,8 +4751,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5023,9 +4770,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5045,9 +4790,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry 
-; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5066,28 +4809,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5099,8 +4840,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5113,11 +4853,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5126,11 +4865,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap 
v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -5140,12 +4878,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5157,12 +4893,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5181,8 +4915,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; 
GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5201,9 +4934,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5223,9 +4954,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5244,28 +4973,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5277,8 +5004,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5291,11 +5017,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5304,11 +5029,10 @@ ; ; GFX940-TGSPLIT-LABEL: 
flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -5318,12 +5042,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5335,12 +5057,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5359,8 +5079,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5379,9 +5098,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5401,9 +5118,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5422,28 +5137,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5455,8 +5168,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5469,11 +5181,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5482,11 +5193,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -5496,12 +5206,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5513,12 +5221,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5537,8 +5243,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5557,9 +5262,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5579,9 +5282,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5600,28 +5301,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5633,8 +5332,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5647,11 +5345,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5660,11 +5357,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -5674,12 +5370,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5691,12 +5385,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5715,8 +5407,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5735,9 +5426,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5757,9 +5446,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5778,28 +5465,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5811,8 +5496,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5825,11 +5509,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5838,11 +5521,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -5852,12 +5534,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5869,12 +5549,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) @@ -5893,8 +5571,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5913,9 +5590,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -5935,9 +5610,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -5956,28 +5629,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -5989,8 +5660,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6003,11 +5673,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6016,11 +5685,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -6030,12 +5698,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6047,12 +5713,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6071,8 +5735,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6091,9 +5754,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -6113,9 +5774,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6134,28 +5793,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6167,8 +5824,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6181,11 +5837,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6194,11 +5849,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -6208,12 +5862,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6225,12 +5877,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6249,8 +5899,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6269,9 +5918,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: 
s_addc_u32 s5, s1, 0 @@ -6291,9 +5938,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -6312,28 +5957,26 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6345,8 +5988,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -6359,11 +6001,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6372,11 +6013,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -6386,12 +6026,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6403,12 +6041,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8521,8 +8157,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8535,9 +8170,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8550,9 +8183,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8565,22 +8196,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8589,8 +8218,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8599,44 +8227,38 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8650,8 +8272,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX7-LABEL: 
flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8664,9 +8285,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8681,9 +8300,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8696,22 +8313,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8720,8 +8335,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8732,21 +8346,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -8754,12 +8366,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8767,12 +8377,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 
; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8786,8 +8394,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8800,9 +8407,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8817,9 +8422,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8832,22 +8435,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, 
s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8856,8 +8457,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8867,33 +8467,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: 
flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -8902,12 +8498,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8921,8 +8515,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8935,9 +8528,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -8954,9 +8545,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -8969,22 +8558,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -8993,8 +8580,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9006,21 +8592,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -9029,12 +8613,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9044,12 +8626,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9063,8 +8643,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9077,9 +8656,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9096,9 +8673,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9111,22 +8686,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9135,8 +8708,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9148,21 +8720,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -9171,12 +8741,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9186,12 +8754,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; 
%bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9205,8 +8771,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9219,9 +8784,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9236,9 +8799,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9251,22 +8812,20 @@ ; ; SKIP-CACHE-INV-LABEL: 
flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9275,8 +8834,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9287,21 +8845,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -9309,12 +8865,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9322,12 +8876,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9341,8 +8893,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9355,9 +8906,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9372,9 +8921,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9387,22 +8934,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9411,8 +8956,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9423,21 +8967,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -9445,12 +8987,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9458,12 +8998,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9477,8 +9015,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9491,9 +9028,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9510,9 +9045,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9525,22 +9058,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9549,8 +9080,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9562,21 +9092,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -9585,12 +9113,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9600,12 +9126,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9619,8 +9143,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9633,9 +9156,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9652,9 +9173,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9667,22 +9186,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9691,8 +9208,7 
@@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9704,21 +9220,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -9727,12 +9241,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9742,12 +9254,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9761,8 +9271,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9775,9 +9284,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9794,9 +9301,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9809,22 +9314,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9833,8 +9336,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9846,21 +9348,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -9869,12 +9369,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9884,12 +9382,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9903,8 +9399,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9917,9 +9412,7 
@@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -9936,9 +9429,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -9951,22 +9442,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9975,8 +9464,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -9988,21 +9476,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10011,12 +9497,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10026,12 +9510,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10045,8 +9527,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10059,9 +9540,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10078,9 +9557,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10093,22 +9570,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10117,8 +9592,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10130,21 +9604,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10153,12 +9625,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10168,12 +9638,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10187,8 +9655,7 @@ define amdgpu_kernel void 
@flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10201,9 +9668,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10220,9 +9685,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10235,22 +9698,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 
s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10259,8 +9720,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10272,21 +9732,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: 
; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10295,12 +9753,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10310,12 +9766,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10329,8 +9783,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10343,9 +9796,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10362,9 +9813,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10377,22 +9826,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 
-; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10401,8 +9848,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10414,21 +9860,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; 
GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10437,12 +9881,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10452,12 +9894,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 
:: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10471,8 +9911,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10485,9 +9924,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 @@ -10504,9 +9941,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 @@ -10519,22 +9954,20 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 -; 
SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s1, s1, 0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10543,8 +9976,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10556,21 +9988,19 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -10579,12 +10009,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10594,12 +10022,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10613,8 +10039,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10631,9 +10056,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10650,9 +10073,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10669,26 +10090,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10699,8 +10118,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], 
s[2:3], s[2:3] op_sel:[0,1] @@ -10711,11 +10129,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10723,11 +10140,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10735,12 +10151,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10749,12 +10163,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10772,8 +10184,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10790,9 +10201,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: 
s_addc_u32 s5, s1, 0 @@ -10811,9 +10220,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10830,26 +10237,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10860,8 +10265,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -10873,11 +10277,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -10885,11 +10288,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -10898,12 +10300,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10914,12 +10314,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10937,8 +10335,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10955,9 +10352,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -10976,9 +10371,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -10995,26 +10388,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 
+; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11025,8 +10416,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11038,11 +10428,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
flat_store_dword v[0:1], v2 @@ -11050,11 +10439,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11063,12 +10451,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11079,12 +10465,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11102,8 +10486,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11120,9 +10503,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11143,9 +10524,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11162,26 +10541,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; 
SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11192,8 +10569,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11206,11 +10582,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11218,11 +10593,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11232,12 +10606,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11250,12 +10622,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11273,8 +10643,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11291,9 +10660,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11314,9 +10681,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11333,26 +10698,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11363,8 +10726,7 @@ ; ; 
GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11377,11 +10739,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11389,11 +10750,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) @@ -11403,12 +10763,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11421,12 +10779,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11444,8 +10800,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 
@@ -11462,9 +10817,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11483,9 +10836,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11502,26 +10853,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11532,8 +10881,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11545,11 +10893,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11557,11 +10904,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; 
GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -11570,12 +10916,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11586,12 +10930,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11609,8 +10951,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11627,9 +10968,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11648,9 +10987,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11667,26 +11004,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: 
s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11697,8 +11032,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11710,11 +11044,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11722,11 +11055,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 @@ -11735,12 +11067,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11751,12 +11081,10 @@ ; ; 
GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11774,8 +11102,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11792,9 +11119,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11815,9 +11140,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 
s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -11834,26 +11157,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11864,8 +11185,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -11878,11 +11198,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -11890,11 +11209,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -11904,12 +11222,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11922,12 +11238,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11945,8 +11259,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11963,9 +11276,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -11986,9 +11297,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12005,26 +11314,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12035,8 +11342,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12049,11 +11355,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12061,11 +11366,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12075,12 +11379,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12093,12 +11395,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12116,8 +11416,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; 
GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12134,9 +11433,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12157,9 +11454,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12176,26 +11471,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12206,8 +11499,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12220,11 +11512,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12232,11 +11523,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12246,12 +11536,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12264,12 +11552,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12287,8 +11573,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12305,9 +11590,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12328,9 +11611,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12347,26 +11628,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 
-; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12377,8 +11656,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] 
@@ -12391,11 +11669,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12403,11 +11680,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12417,12 +11693,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12435,12 +11709,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12458,8 +11730,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12476,9 +11747,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12499,9 +11768,7 @@ ; ; 
GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12518,26 +11785,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], 
s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12548,8 +11813,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12562,11 +11826,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12574,11 +11837,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12588,12 +11850,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12606,12 +11866,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12629,8 +11887,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12647,9 +11904,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12670,9 +11925,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12689,26 +11942,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12719,8 +11970,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12733,11 +11983,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12745,11 +11994,10 @@ 
; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12759,12 +12007,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12777,12 +12023,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12800,8 +12044,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12818,9 +12061,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -12841,9 +12082,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -12860,26 +12099,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 
s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12890,8 +12127,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -12904,11 +12140,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -12916,11 +12151,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -12930,12 +12164,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12948,12 +12180,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12971,8 +12201,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12989,9 +12218,7 @@ ; ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 @@ -13012,9 +12239,7 @@ ; ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 @@ -13031,26 +12256,24 @@ ; ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 -; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 +; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s0, 16 +; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s1, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s3 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13061,8 +12284,7 @@ ; ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] @@ -13075,11 +12297,10 @@ ; ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 @@ -13087,11 +12308,10 @@ ; ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) @@ -13101,12 +12321,10 @@ ; ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; 
%bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13119,12 +12337,10 @@ ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -2352,20 +2352,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; 
GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2378,85 +2378,76 @@ ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_agent_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; 
%entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2466,9 +2457,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2485,22 +2474,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 
s1, s1, 0 @@ -2515,14 +2504,12 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -2530,14 +2517,12 @@ ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -2545,70 +2530,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -2620,9 +2600,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2641,21 +2619,21 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2669,96 +2647,87 @@ ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2770,9 +2739,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2791,23 +2758,23 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2823,16 +2790,14 @@ ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -2840,16 +2805,14 @@ ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -2857,77 +2820,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2941,9 +2899,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX11-CU: 
; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2964,23 +2920,23 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2996,16 +2952,14 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3013,16 +2967,14 @@ ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3030,77 +2982,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3114,9 +3061,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3137,22 +3082,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3167,14 +3112,12 @@ ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3182,14 +3125,12 @@ ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3197,70 +3138,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: 
buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3272,9 +3208,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3293,22 +3227,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3323,14 +3257,12 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 
s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3338,14 +3270,12 @@ ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3353,70 +3283,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3428,9 +3353,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3449,23 +3372,23 @@ define 
amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3481,16 +3404,14 @@ ; ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3498,16 +3419,14 @@ ; ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3515,77 +3434,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3599,9 +3513,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3622,23 +3534,23 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3654,16 +3566,14 @@ ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3671,16 +3581,14 @@ ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3688,77 +3596,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3772,9 +3675,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3795,23 +3696,23 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3827,16 +3728,14 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3844,16 +3743,14 @@ ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3861,77 +3758,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3945,9 +3837,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3968,23 +3858,23 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, 
s1, 0 @@ -4000,16 +3890,14 @@ ; ; GFX10-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4017,16 +3905,14 @@ ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4034,77 +3920,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4118,9 +3999,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4141,23 +4020,23 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4173,16 +4052,14 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4190,16 +4067,14 @@ ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; 
GFX10-CU-NEXT: buffer_gl1_inv @@ -4207,77 +4082,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_agent_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4291,9 +4161,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4314,23 +4182,23 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4346,16 +4214,14 @@ ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4363,16 +4229,14 @@ ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4380,77 +4244,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry 
-; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4464,9 +4323,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4487,23 +4344,23 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4519,16 +4376,14 @@ ; ; GFX10-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4536,16 +4391,14 @@ ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4553,77 +4406,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4637,9 +4485,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4660,23 +4506,23 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4692,16 +4538,14 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4709,16 +4553,14 @@ ; ; GFX10-CU-LABEL: 
global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4726,77 +4568,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4810,9 +4647,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4833,22 +4668,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4865,99 +4700,90 @@ ; ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4969,9 +4795,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4992,23 +4816,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5026,107 +4850,98 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5140,9 +4955,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5165,23 +4978,23 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5199,110 +5012,101 @@ ; ; GFX10-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5316,9 +5120,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; 
%entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5341,24 +5143,24 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5377,118 +5179,109 @@ ; ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; 
%entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5504,9 +5297,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5531,24 +5322,24 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: 
s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5567,118 +5358,109 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 
0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: 
buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5694,9 +5476,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5721,23 +5501,23 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5755,107 +5535,98 @@ ; ; GFX10-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword 
v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5869,9 +5640,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5894,23 +5663,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5928,107 +5697,98 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, 
v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword 
v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; 
GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6042,9 +5802,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6067,24 +5825,24 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6103,118 +5861,109 @@ ; ; GFX10-WGP-LABEL: 
global_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6230,9 +5979,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6257,24 +6004,24 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: 
global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6293,118 +6040,109 @@ ; ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6420,9 +6158,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6447,24 +6183,24 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6483,118 +6219,109 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 
v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6610,9 +6337,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6637,24 +6362,24 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6673,118 +6398,109 @@ ; ; GFX10-WGP-LABEL: 
global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6800,9 +6516,7 @@ ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6827,24 +6541,24 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: 
global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6863,118 +6577,109 @@ ; ; GFX10-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6990,9 +6695,7 @@ ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7017,24 +6720,24 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7053,118 +6756,109 @@ ; ; GFX10-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
-; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 
v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7180,9 +6874,7 @@ ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7207,24 +6899,24 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7243,118 +6935,109 @@ ; ; GFX10-WGP-LABEL: 
global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7370,9 +7053,7 @@ ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7397,24 +7078,24 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: 
global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -7433,118 +7114,109 @@ ; ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7560,9 +7232,7 @@ ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9925,20 +9595,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9951,85 +9621,76 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10039,9 +9700,7 @@ ; ; GFX11-CU-LABEL: 
global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10058,22 +9717,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10088,14 +9747,12 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10103,14 +9760,12 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10118,70 +9773,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10193,9 +9843,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10214,21 +9862,21 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10242,96 +9890,87 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) 
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10343,9 +9982,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10364,23 +10001,23 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10396,16 +10033,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10413,16 +10048,14 @@ ; ; 
GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10430,77 +10063,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: 
; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10514,9 +10142,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10537,23 +10163,23 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10569,16 +10195,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10586,16 +10210,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10603,77 +10225,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry 
-; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10687,9 +10304,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; 
%bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10710,22 +10325,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10740,14 +10355,12 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10755,14 +10368,12 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10770,70 +10381,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10845,9 +10451,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10866,22 +10470,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10896,14 +10500,12 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10911,14 +10513,12 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10926,70 +10526,65 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: 
s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11001,9 +10596,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11022,23 +10615,23 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: 
global_agent_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11054,16 +10647,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 
; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11071,16 +10662,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11088,77 +10677,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11172,9 +10756,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11195,23 +10777,23 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11227,16 +10809,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11244,16 +10824,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11261,77 +10839,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11345,9 +10918,7 @@ ; ; 
GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11368,23 +10939,23 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11400,16 +10971,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11417,16 +10986,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11434,77 +11001,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 
-; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11518,9 +11080,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11541,23 +11101,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11573,16 
+11133,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11590,16 +11148,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11607,77 +11163,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11691,9 +11242,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11714,23 +11263,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11746,16 +11295,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11763,16 +11310,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; 
GFX10-CU-NEXT: buffer_gl1_inv @@ -11780,77 +11325,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11864,9 +11404,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11887,23 +11425,23 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; 
GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11919,16 +11457,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11936,16 +11472,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11953,77 +11487,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12037,9 +11566,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12060,23 +11587,23 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -12092,16 +11619,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -12109,16 +11634,14 @@ ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -12126,77 +11649,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12210,9 +11728,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12233,23 +11749,23 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -12265,16 +11781,14 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -12282,16 +11796,14 @@ ; ; GFX10-CU-LABEL: 
global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -12299,77 +11811,72 @@ ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12383,9 +11890,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -12406,22 +11911,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12438,99 +11943,90 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12542,9 +12038,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12565,23 +12059,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12599,107 +12093,98 
@@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12713,9 +12198,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12738,24 +12221,24 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12774,118 +12257,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: 
; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12901,9 +12375,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12928,24 +12400,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12964,118 +12436,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13091,9 +12554,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13118,23 +12579,23 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13152,107 +12613,98 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: 
buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13266,9 +12718,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13291,23 +12741,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13325,107 +12775,98 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13439,9 +12880,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13464,24 +12903,24 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: 
global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13500,118 +12939,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] 
offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13627,9 +13057,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13654,24 +13082,24 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13690,118 +13118,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13817,9 +13236,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13844,24 +13261,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13880,118 +13297,109 @@ ; ; GFX10-WGP-LABEL: 
global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: 
buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; 
GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14007,9 +13415,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14034,24 +13440,24 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: 
global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14070,118 +13476,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14197,9 +13594,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14224,24 +13619,24 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14260,118 +13655,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14387,9 +13773,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14414,24 +13798,24 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14450,118 +13834,109 @@ ; ; 
GFX10-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14577,9 +13952,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14604,24 +13977,24 @@ define amdgpu_kernel void 
@global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14640,118 +14013,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14767,9 +14131,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14794,24 +14156,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14830,118 +14192,109 @@ ; ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: 
; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14957,9 +14310,7 @@ ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -2087,20 +2087,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2113,85 +2113,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2201,9 +2192,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2220,20 +2209,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2246,85 +2235,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2334,9 +2314,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2353,20 +2331,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2379,85 +2357,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] 
offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2467,9 +2436,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2486,20 +2453,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2512,85 +2479,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2600,9 +2558,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2619,20 +2575,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: 
v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2645,85 +2601,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2733,9 +2680,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2752,20 +2697,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2778,85 +2723,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2866,9 +2802,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2885,20 +2819,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 
0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2911,85 +2845,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_singlethread_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2999,9 +2924,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3018,20 +2941,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 
; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3044,85 +2967,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3132,9 +3046,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3151,20 +3063,20 @@ define amdgpu_kernel void 
@global_singlethread_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3177,85 +3089,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] 
-; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3265,9 +3168,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3284,20 +3185,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3310,85 +3211,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3398,9 +3290,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3417,20 +3307,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3443,85 +3333,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3531,9 +3412,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3550,20 +3429,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: 
s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3576,85 +3455,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3664,9 +3534,7 @@ ; ; GFX11-CU-LABEL: 
global_singlethread_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3683,20 +3551,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3709,85 +3577,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3797,9 +3656,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3816,20 +3673,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3842,85 +3699,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3930,9 +3778,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3949,20 +3795,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3975,85 +3821,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4063,9 +3900,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4082,22 +3917,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4114,99 +3949,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4218,9 +4044,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4241,22 +4065,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4273,99 +4097,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 
v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4377,9 +4192,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4400,22 +4213,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4432,99 +4245,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_singlethread_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4536,9 +4340,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4559,22 +4361,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4591,99 +4393,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4695,9 +4488,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4718,22 +4509,22 @@ define amdgpu_kernel void 
@global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4750,99 +4541,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: 
s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; 
GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4854,9 +4636,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4877,22 +4657,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4909,99 +4689,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: 
s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 
0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5013,9 +4784,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5036,22 +4805,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5068,99 +4837,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5172,9 +4932,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5195,22 +4953,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: 
s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5227,99 +4985,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], 
s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5331,9 +5080,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5354,22 +5101,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5386,99 +5133,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5490,9 +5228,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5513,22 +5249,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5545,99 +5281,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5649,9 +5376,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5672,22 +5397,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5704,99 +5429,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt 
vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; 
GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5808,9 +5524,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5831,22 +5545,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: 
s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5863,99 +5577,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
-; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5967,9 +5672,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5990,22 +5693,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6022,99 +5725,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, 
v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6126,9 +5820,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6149,22 +5841,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6181,99 +5873,90 @@ ; ; GFX10-WGP-LABEL: 
global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6285,9 +5968,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6308,22 +5989,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt 
vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6340,99 +6021,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, 
v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6444,9 +6116,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8540,20 +8210,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8566,85 +8236,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 
-; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8654,9 +8315,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8673,20 +8332,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8699,85 +8358,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8787,9 +8437,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 
v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8806,20 +8454,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8832,85 +8480,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8920,9 +8559,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8939,20 +8576,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8965,85 +8602,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 
0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9053,9 +8681,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9072,20 +8698,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9098,85 +8724,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 
s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9186,9 +8803,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9205,20 +8820,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9231,85 +8846,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9318,10 +8924,8 @@ ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: -; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9338,20 +8942,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9364,85 +8968,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9452,9 +9047,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9471,20 +9064,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9497,85 +9090,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] 
offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_singlethread_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9585,9 +9169,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9604,20 +9186,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 
v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9630,85 +9212,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 
0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9718,9 +9291,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9737,20 +9308,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9763,85 +9334,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, 
s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9851,9 +9413,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9870,20 +9430,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; 
GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9896,85 +9456,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9984,9 +9535,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10003,20 +9552,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10029,85 +9578,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10117,9 +9657,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10136,20 +9674,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10161,86 +9699,77 @@ ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: -; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; 
GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10250,9 +9779,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10269,20 +9796,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10295,85 +9822,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10383,9 +9901,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10402,20 +9918,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10428,85 +9944,76 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10516,9 +10023,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10535,22 +10040,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: 
global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10567,99 +10072,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt 
vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; 
GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10671,9 +10167,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10694,22 +10188,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10726,99 +10220,90 @@ ; ; GFX10-WGP-LABEL: 
global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10830,9 +10315,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10853,22 +10336,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10885,99 +10368,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 
glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, 
v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10989,9 +10463,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11012,22 +10484,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; 
%bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11044,99 +10516,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, 
s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11148,9 +10611,7 @@ ; ; 
GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11171,22 +10632,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11203,99 +10664,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11307,9 +10759,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11330,22 +10780,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11362,99 +10812,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: 
s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11466,9 +10907,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11489,22 +10928,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11521,99 +10960,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11625,9 +11055,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 
0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11648,22 +11076,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11680,99 +11108,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; 
; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11784,9 +11203,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11807,22 +11224,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; 
%entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11839,99 +11256,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11943,9 +11351,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11966,22 +11372,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11998,99 +11404,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12102,9 +11499,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12125,22 +11520,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12157,99 +11552,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12261,9 +11647,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12284,22 +11668,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12316,99 +11700,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; 
%bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12420,9 +11795,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12443,22 +11816,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12475,99 +11848,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12579,9 +11943,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12602,22 +11964,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: 
global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12634,99 +11996,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 @@ -12738,9 +12091,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12761,22 +12112,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12793,99 +12144,90 @@ ; ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12897,9 +12239,7 @@ ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -2382,20 +2382,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, 
s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2408,85 +2408,76 @@ ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg: ; 
SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
@@ -2496,9 +2487,7 @@ ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2515,22 +2504,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2545,14 +2534,12 @@ ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -2560,14 +2547,12 @@ ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -2575,25 +2560,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2601,12 +2586,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2614,33 +2598,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
global_system_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2652,9 +2632,7 @@ ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2673,21 +2651,21 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2701,98 +2679,89 @@ ; ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2804,9 +2773,7 @@ ; ; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2825,23 +2792,23 
@@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2857,16 +2824,14 @@ ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -2874,16 +2839,14 @@ ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -2891,28 +2854,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2920,14 +2883,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -2935,37 +2897,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: 
global_system_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -2979,9 +2937,7 @@ ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3002,23 +2958,23 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3034,16 +2990,14 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
+; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3051,16 +3005,14 @@ ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3068,28 +3020,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3097,14 +3049,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3112,37 +3063,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3156,9 +3103,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3179,22 +3124,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3209,14 +3154,12 @@ ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3224,14 +3167,12 @@ ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3239,25 +3180,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3265,12 +3206,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3278,33 +3218,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3316,9 +3252,7 @@ ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3337,22 +3271,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3367,14 +3301,12 @@ ; ; 
GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3382,14 +3314,12 @@ ; ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3397,25 +3327,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3423,12 +3353,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3436,33 +3365,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3474,9 +3399,7 @@ ; ; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3495,23 +3418,23 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3527,16 +3450,14 @@ ; ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3544,16 +3465,14 @@ ; ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3561,28 +3480,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3590,14 +3509,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3605,37 +3523,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3649,9 +3563,7 @@ ; ; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3672,23 +3584,23 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3704,16 +3616,14 @@ ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3721,16 +3631,14 @@ ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ 
-3738,28 +3646,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3767,14 +3675,13 
@@ ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3782,37 +3689,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3826,9 +3729,7 @@ ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3849,23 +3750,23 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; 
GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3881,16 +3782,14 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -3898,16 +3797,14 @@ ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -3915,28 +3812,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], 
s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3944,14 +3841,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -3959,37 +3855,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4003,9 +3895,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4026,23 +3916,23 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 
0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4058,16 +3948,14 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4075,16 +3963,14 @@ ; ; 
GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4092,28 +3978,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; 
%bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -4121,14 +4007,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -4136,37 +4021,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4180,9 +4061,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4203,22 +4082,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4235,99 +4114,90 @@ ; ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 
-; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4339,9 +4209,7 @@ ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4362,23 +4230,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4396,109 +4264,100 @@ ; ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_system_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, 
s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg: 
; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4512,9 +4371,7 @@ ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4537,24 +4394,24 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4573,122 +4430,113 @@ ; ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4704,9 +4552,7 @@ ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4731,24 +4577,24 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; 
GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4767,122 +4613,113 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 
0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4898,9 +4735,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4925,23 +4760,23 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4959,109 +4794,100 @@ ; ; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_system_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, 
s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg: 
; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5075,9 +4901,7 @@ ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5100,23 +4924,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5134,109 +4958,100 @@ ; ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5250,9 +5065,7 @@ ; ; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5275,24 +5088,24 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5311,122 +5124,113 @@ ; ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: 
; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: 
; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5442,9 +5246,7 @@ ; ; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5469,24 
+5271,24 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5505,122 +5307,113 @@ ; ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5636,9 +5429,7 @@ ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5663,24 +5454,24 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5699,122 +5490,113 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5830,9 +5612,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5857,24 +5637,24 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; 
GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5893,122 +5673,113 @@ ; ; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6024,9 +5795,7 @@ ; ; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6051,24 +5820,24 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ 
-6087,122 +5856,113 @@ ; ; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6218,9 +5978,7 @@ ; ; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6245,24 +6003,24 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6281,122 +6039,113 @@ ; ; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 
-1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6412,9 +6161,7 @@ ; ; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6439,24 +6186,24 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6475,122 +6222,113 @@ ; ; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6606,9 +6344,7 @@ ; ; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6633,24 +6369,24 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6669,122 +6405,113 @@ ; ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
+; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; 
%entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6800,9 +6527,7 @@ ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9195,20 +8920,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9221,85 +8946,76 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 
+; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9309,9 +9025,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9328,22 +9042,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX6: 
; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9358,14 +9072,12 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -9373,14 +9085,12 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; 
GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -9388,25 +9098,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9414,12 +9124,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9427,33 +9136,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 
sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9465,9 +9170,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9486,21 +9189,21 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: 
s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9514,98 +9217,89 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9617,9 +9311,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9638,23 +9330,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 
v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9670,16 +9362,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -9687,16 +9377,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -9704,28 +9392,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9733,14 +9421,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9748,37 +9435,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] 
offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9792,9 +9475,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9815,23 +9496,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: 
global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9847,16 +9528,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -9864,16 +9543,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -9881,28 +9558,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: 
s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9910,14 +9587,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -9925,37 +9601,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9969,9 +9641,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; 
%entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9992,22 +9662,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10022,14 +9692,12 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10037,14 +9705,12 @@ ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10052,25 +9718,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10078,12 +9744,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10091,33 +9756,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10128,10 +9789,8 @@ ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: -; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 
s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10150,22 +9809,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10180,14 +9839,12 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10195,14 +9852,12 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10210,25 +9865,25 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10236,12 +9891,11 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10249,33 +9903,29 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10287,9 +9937,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 
; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10308,23 +9956,23 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10340,16 +9988,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10357,16 +10003,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10374,28 +10018,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10403,14 +10047,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10418,37 +10061,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10462,9 +10101,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10485,23 +10122,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10517,16 +10154,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10534,16 +10169,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10551,28 +10184,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10580,14 +10213,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10595,37 +10227,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10639,9 +10267,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10662,23 +10288,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; 
GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10694,16 +10320,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10711,16 +10335,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10728,28 +10350,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10757,14 +10379,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10772,37 +10393,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10816,9 +10433,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10839,23 +10454,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10871,16 +10486,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -10888,16 +10501,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -10905,28 +10516,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10934,14 +10545,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -10949,37 +10559,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10993,9 +10599,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11016,23 +10620,23 @@ define amdgpu_kernel void 
@global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11048,16 +10652,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11065,16 +10667,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11082,28 +10682,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11111,14 +10711,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11126,37 +10725,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11170,9 +10765,7 @@ ; ; GFX11-CU-LABEL: 
global_system_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11193,23 +10786,23 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11224,17 +10817,15 @@ ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: -; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP: ; %bb.0: 
; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11242,16 +10833,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11259,28 +10848,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11288,14 +10877,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11303,37 +10891,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; 
GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11347,9 +10931,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11370,23 +10952,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11402,16 +10984,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11419,16 +10999,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11436,28 +11014,28 @@ ; ; SKIP-CACHE-INV-LABEL: 
global_system_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11465,14 +11043,13 @@ ; ; GFX90A-TGSPLIT-LABEL: 
global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11480,37 +11057,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11524,9 +11097,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11547,23 +11118,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; 
GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11579,16 +11150,14 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -11596,16 +11165,14 @@ ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -11613,28 +11180,28 @@ ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11642,14 +11209,13 @@ ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol @@ -11657,37 +11223,33 @@ ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 
sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11701,9 +11263,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11724,22 +11284,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: 
s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11756,99 +11316,90 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11860,9 +11411,7 @@ ; ; GFX11-CU-LABEL: 
global_system_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11883,23 +11432,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11917,109 +11466,100 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12033,9 +11573,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12058,23 +11596,23 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12092,112 +11630,103 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 
-; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12211,9 +11740,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12236,24 +11763,24 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12272,122 +11799,113 @@ ; ; GFX10-WGP-LABEL: 
global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) 
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 
sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12403,9 +11921,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12430,24 +11946,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12466,122 +11982,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt 
lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 
0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12597,9 +12104,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12624,23 +12129,23 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; 
GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12658,109 +12163,100 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; 
; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 
0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12774,9 +12270,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12799,23 +12293,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12833,109 
+12327,100 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 
0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12949,9 +12434,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12974,24 +12457,24 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: 
s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13010,122 +12493,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; 
GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13141,9 +12615,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13168,24 +12640,24 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: 
buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13204,122 +12676,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 
+; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13335,9 +12798,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13362,24 +12823,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 
0 @@ -13398,122 +12859,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 
glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13529,9 +12981,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13556,24 +13006,24 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13592,122 +13042,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13723,9 +13164,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13750,24 +13189,24 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: 
s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13786,122 +13225,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13917,9 +13347,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13944,24 +13372,24 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 
glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13980,122 +13408,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14111,9 +13530,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14138,24 +13555,24 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 
+; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14174,122 +13591,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: 
buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14305,9 +13713,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 
s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14332,24 +13738,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -14368,122 +13774,113 @@ ; ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2 ; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: buffer_wbl2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_invl2 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 sc1 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 sc1 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -14499,9 +13896,7 @@ ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -2087,20 +2087,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2113,85 +2113,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; 
GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2201,9 +2192,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2220,20 +2209,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2246,85 +2235,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2334,9 +2314,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2353,20 +2331,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2379,85 +2357,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; 
%bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2467,9 +2436,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2486,20 +2453,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2512,85 +2479,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_wavefront_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2600,9 +2558,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2619,20 +2575,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2645,85 +2601,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2733,9 +2680,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2752,20 +2697,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 
0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2778,85 +2723,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2866,9 +2802,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2885,20 +2819,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2911,85 +2845,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2999,9 +2924,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3018,20 +2941,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: 
s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3044,85 +2967,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; 
GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3132,9 +3046,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3151,20 +3063,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3177,85 +3089,76 @@ 
; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 
+; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3265,9 +3168,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3284,20 +3185,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: 
global_wavefront_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3310,85 +3211,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3398,9 +3290,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3417,20 +3307,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3443,85 +3333,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3531,9 +3412,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3550,20 +3429,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3576,85 +3455,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_wavefront_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3664,9 +3534,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3683,20 +3551,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3709,85 +3577,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 
+; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3797,9 +3656,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3816,20 +3673,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3842,85 +3699,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3930,9 +3778,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3949,20 +3795,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: 
s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3975,85 +3821,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; 
GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4063,9 +3900,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; 
GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4082,22 +3917,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4114,99 +3949,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, 
v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; 
GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4218,9 +4044,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4241,22 +4065,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4273,99 +4097,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_wavefront_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4377,9 +4192,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4400,22 +4213,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], 
off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4432,99 +4245,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4536,9 +4340,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4559,22 +4361,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: 
global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4591,99 +4393,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: 
; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4695,9 +4488,7 @@ ; ; GFX11-CU-LABEL: 
global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4718,22 +4509,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4750,99 +4541,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4854,9 +4636,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4877,22 +4657,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; 
GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4909,99 +4689,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_wavefront_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5013,9 +4784,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5036,22 +4805,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5068,99 +4837,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, 
s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5172,9 +4932,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5195,22 +4953,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; 
GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5227,99 +4985,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5331,9 +5080,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5354,22 +5101,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5386,99 +5133,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap 
v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5490,9 +5228,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5513,22 +5249,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5545,99 +5281,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5649,9 +5376,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5672,22 +5397,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5704,99 +5429,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 
glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5808,9 +5524,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5831,22 +5545,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5863,99 +5577,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5967,9 +5672,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5990,22 +5693,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6022,99 +5725,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6126,9 +5820,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6149,22 +5841,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6181,99 +5873,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, 
v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6285,9 +5968,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6308,22 +5989,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 
0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6340,99 +6021,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 
+; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6444,9 +6116,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8540,20 +8210,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8566,85 +8236,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8654,9 +8315,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8673,20 +8332,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8699,85 +8358,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; 
%bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8787,9 +8437,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -8806,20 +8454,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8832,85 +8480,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8920,9 +8559,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -8939,20 +8576,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -8965,85 +8602,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9053,9 +8681,7 @@ ; ; 
GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9072,20 +8698,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9098,85 +8724,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: 
s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9186,9 +8803,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9205,20 +8820,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
+; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9231,85 +8846,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: 
; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; 
GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9318,10 +8924,8 @@ ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: -; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9338,20 +8942,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 
; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9364,85 +8968,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9452,9 +9047,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9471,20 +9064,20 @@ 
define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9497,85 +9090,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: 
; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], 
s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9585,9 +9169,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9604,20 +9186,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; 
GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9630,85 +9212,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9718,9 +9291,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9737,20 +9308,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9763,85 +9334,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9851,9 +9413,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9870,20 +9430,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9896,85 +9456,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9984,9 +9535,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10003,20 +9552,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10029,85 +9578,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 
s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10117,9 +9657,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10136,20 +9674,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10161,86 +9699,77 @@ ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: -; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 
+; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10250,9 +9779,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10269,20 +9796,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10295,85 +9822,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm 
; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10383,9 +9901,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10402,20 +9918,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10428,85 +9944,76 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10516,9 +10023,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10535,22 +10040,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 
v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10567,99 +10072,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10671,9 +10167,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10694,22 +10188,22 @@ define amdgpu_kernel void 
@global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10726,99 +10220,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 
offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10830,9 +10315,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10853,22 +10336,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -10885,99 +10368,90 @@ ; ; GFX10-WGP-LABEL: 
global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10989,9 +10463,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11012,22 +10484,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11044,99 +10516,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11148,9 +10611,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11171,22 +10632,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; 
GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11203,99 +10664,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11307,9 +10759,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; 
GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11330,22 +10780,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11362,99 +10812,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11466,9 +10907,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11489,22 +10928,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11521,99 +10960,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11625,9 +11055,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11648,22 +11076,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11680,99 +11108,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11784,9 +11203,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11807,22 +11224,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11839,99 +11256,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11943,9 +11351,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11966,22 +11372,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, 
s1, 0 @@ -11998,99 +11404,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12102,9 +11499,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12125,22 +11520,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 
v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12157,99 +11552,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 
+; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, 
s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12261,9 +11647,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12284,22 +11668,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12316,99 +11700,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, 
s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; 
SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12420,9 +11795,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; 
%bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12443,22 +11816,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12475,99 +11848,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12579,9 +11943,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12602,22 +11964,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; 
GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12634,99 +11996,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12738,9 +12091,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12761,22 +12112,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12793,99 +12144,90 @@ ; ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12897,9 +12239,7 @@ ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -2224,20 +2224,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2250,85 +2250,76 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_workgroup_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2338,9 +2329,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2357,20 +2346,20 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 
v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2383,91 +2372,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; 
%bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2478,9 +2458,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2497,21 +2475,21 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, 
s1, 0 @@ -2525,93 +2503,84 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2623,9 +2592,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2643,21 +2610,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2671,99 +2638,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, 
s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2776,9 +2734,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2796,21 +2752,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2824,99 +2780,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2929,9 +2876,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -2949,20 +2894,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], 
s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -2975,91 +2920,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; 
GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3070,9 +3006,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3089,20 +3023,20 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_workgroup_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3115,91 +3049,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ 
-3210,9 +3135,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3229,21 +3152,21 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3257,99 +3180,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 
0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: 
v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3362,9 +3276,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3382,21 +3294,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3410,99 +3322,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3515,9 +3418,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3535,21 +3436,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: 
global_workgroup_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3563,99 +3464,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; 
GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3668,9 +3560,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3688,21 +3578,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3716,99 +3606,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] 
op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3821,9 +3702,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3841,21 +3720,21 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; 
GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -3869,99 +3748,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_workgroup_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3974,9 +3844,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -3994,21 +3862,21 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_workgroup_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4022,99 +3890,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; 
SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_workgroup_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4127,9 +3986,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4147,21 +4004,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: 
s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4175,99 +4032,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4280,9 +4128,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4300,21 +4146,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -4328,99 +4174,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; 
GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4433,9 +4270,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: 
s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4453,22 +4288,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4485,99 +4320,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4589,9 +4415,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4612,22 +4436,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; 
GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4644,102 +4468,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4752,9 +4567,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4775,23 +4588,23 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 
-; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4809,107 +4622,98 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
-; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4923,9 +4727,7 @@ ; ; GFX11-CU-LABEL: 
global_workgroup_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -4947,23 +4749,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -4981,110 +4783,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; 
SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5099,9 +4892,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5123,23 +4914,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5157,110 +4948,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 
-; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ 
-5275,9 +5057,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5299,22 +5079,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5331,102 +5111,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: 
v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5439,9 +5210,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5462,22 +5231,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5494,102 +5263,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5602,9 +5362,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5625,23 +5383,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, 
-1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -5659,110 +5417,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, 
v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5777,9 +5526,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5801,23 +5548,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 
s5, s1, 0 @@ -5835,110 +5582,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry 
-; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5953,9 +5691,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -5977,23 +5713,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6011,110 +5747,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: 
s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6129,9 +5856,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6153,23 +5878,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ 
-6187,110 +5912,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6305,9 +6021,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6329,23 +6043,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6363,110 +6077,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; 
SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6481,9 +6186,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6505,23 +6208,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 
s5, s1, 0 @@ -6539,110 +6242,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry 
-; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6657,9 +6351,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6681,23 +6373,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -6715,110 +6407,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: 
s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6833,9 +6516,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -6857,23 +6538,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ 
-6891,110 +6572,101 @@ ; ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; 
SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; 
GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -7009,9 +6681,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9191,20 +8861,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9217,85 +8887,76 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: 
global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9305,9 +8966,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9324,20 +8983,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9350,91 +9009,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; 
GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9445,9 +9095,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9464,20 +9112,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9490,89 +9138,80 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; 
GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9584,9 +9223,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9603,20 +9240,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9629,95 +9266,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], 
s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9730,9 +9358,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9749,20 +9375,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 
v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -9775,95 +9401,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: 
s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; 
GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9876,9 +9493,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -9895,20 +9510,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 @@ -9921,91 +9536,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: 
s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10015,10 +9621,8 @@ ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU: ; %bb.0: ; %entry +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10035,20 +9639,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10061,91 +9665,82 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: 
global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap 
v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10156,9 +9751,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10175,20 +9768,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: 
s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10201,95 +9794,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; 
GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10302,9 +9886,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10321,20 +9903,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: 
s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10347,95 +9929,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: 
global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10448,9 +10021,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10467,20 +10038,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10493,95 +10064,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10594,9 +10156,7 @@ ; ; 
GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10613,20 +10173,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10639,95 +10199,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: 
s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; 
GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10740,9 +10291,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 @@ -10759,20 +10308,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10785,95 +10334,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; 
GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10886,9 +10426,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -10905,20 +10443,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: 
s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -10930,96 +10468,87 @@ ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; 
GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 
v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11032,9 +10561,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11051,20 +10578,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: 
buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ -11077,95 +10604,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; 
SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-WGP: ; 
%bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11178,9 +10696,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11197,20 +10713,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 @@ 
-11223,95 +10739,86 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11324,9 +10831,7 @@ ; ; GFX11-CU-LABEL: 
global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11343,22 +10848,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11375,99 +10880,90 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 
-; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; 
SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: 
global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11479,9 +10975,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11502,22 +10996,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11534,102 +11028,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; 
GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11642,9 +11127,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11665,22 +11148,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11697,103 +11180,94 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, 
off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; 
GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -11807,9 +11281,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11830,22 +11302,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -11862,106 +11334,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: 
s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; 
SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: 
v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11976,9 +11439,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -11999,22 +11460,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 
offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12031,106 +11492,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12145,9 +11597,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12168,22 +11618,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12200,102 +11650,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; 
SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc 
+; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword 
v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12308,9 +11749,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12331,22 +11770,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: 
global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12363,102 +11802,93 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: 
global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: 
s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: 
global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12471,9 +11901,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12494,22 +11922,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 
0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12526,106 +11954,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, 
s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: 
global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12640,9 +12059,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12663,22 +12080,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12695,106 +12112,97 @@ ; ; GFX10-WGP-LABEL: 
global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; 
GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12809,9 +12217,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12832,22 +12238,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: 
v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -12864,106 +12270,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; 
GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -12978,9 +12375,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13001,22 +12396,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13033,106 +12428,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: 
s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 
v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13147,9 +12533,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13170,22 +12554,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; 
GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13202,106 +12586,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13316,9 +12691,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13339,22 +12712,22 @@ define 
amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13371,106 +12744,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; 
SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: 
; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13485,9 +12849,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13508,22 +12870,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13540,106 +12902,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; 
GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; 
%entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13654,9 +13007,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 @@ -13677,22 +13028,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 -; GFX6-NEXT: s_mov_b32 s2, -1 
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX6-NEXT: s_mov_b32 s7, 0x100f000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s7 -; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 @@ -13709,106 +13060,97 @@ ; ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry -; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry -; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; SKIP-CACHE-INV: ; %bb.0: ; %entry -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2 -; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000 -; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1 +; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 -; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0 +; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s1 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) -; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SKIP-CACHE-INV-NEXT: s_endpgm ; ; 
GFX90A-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] -; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] +; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX90A-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX90A-TGSPLIT: ; %bb.0: ; %entry -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc +; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol -; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[2:3] +; GFX90A-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-TGSPLIT-NEXT: s_endpgm ; ; GFX940-NOTTGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-NOTTGSPLIT: ; %bb.0: 
; %entry -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-NOTTGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-NOTTGSPLIT-NEXT: s_endpgm ; ; GFX940-TGSPLIT-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX940-TGSPLIT: ; %bb.0: ; %entry -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 -; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) -; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 sc0 +; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 sc0 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) ; GFX940-TGSPLIT-NEXT: buffer_inv sc0 -; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5] +; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[0:1] ; GFX940-TGSPLIT-NEXT: s_endpgm ; ; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-WGP: ; %bb.0: ; %entry -; GFX11-WGP-NEXT: s_clause 0x1 -; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 @@ -13823,9 +13165,7 @@ ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX11-CU: ; %bb.0: ; %entry -; GFX11-CU-NEXT: s_clause 0x1 -; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 -; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -14,8 +14,7 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +143,7 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX6-LABEL: local_agent_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -274,8 +272,7 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX6-LABEL: local_agent_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -408,8 +405,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX6-LABEL: local_agent_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: 
s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -555,12 +551,11 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX6-LABEL: local_agent_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -662,12 +657,11 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX6-LABEL: local_agent_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -769,12 +763,11 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX6-LABEL: local_agent_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -889,12 +882,11 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX6-LABEL: local_agent_seq_cst_store: ; 
GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1009,8 +1001,7 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1116,8 +1107,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX6-LABEL: local_agent_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1236,8 +1226,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX6-LABEL: local_agent_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1356,8 +1345,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ 
-1489,8 +1477,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1622,8 +1609,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1756,8 +1742,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1903,8 +1888,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6487,8 +6471,7 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX6-LABEL: local_agent_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6617,8 +6600,7 @@ define amdgpu_kernel void 
@local_agent_one_as_monotonic_load( ; GFX6-LABEL: local_agent_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6747,8 +6729,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX6-LABEL: local_agent_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6877,8 +6858,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX6-LABEL: local_agent_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7007,12 +6987,11 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX6-LABEL: local_agent_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7114,12 +7093,11 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX6-LABEL: local_agent_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 
m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7221,12 +7199,11 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX6-LABEL: local_agent_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7328,12 +7305,11 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX6-LABEL: local_agent_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7435,8 +7411,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7542,8 +7517,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 
0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7649,8 +7623,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX6-LABEL: local_agent_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7756,8 +7729,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7863,8 +7835,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7970,8 +7941,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8100,8 +8070,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, 
s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8230,8 +8199,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -14,8 +14,7 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +143,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX6-LABEL: local_singlethread_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -274,8 +272,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX6-LABEL: local_singlethread_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: 
v_mov_b32_e32 v0, s0 @@ -404,8 +401,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX6-LABEL: local_singlethread_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -534,12 +530,11 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX6-LABEL: local_singlethread_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -641,12 +636,11 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX6-LABEL: local_singlethread_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -748,12 +742,11 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX6-LABEL: local_singlethread_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: 
v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -855,12 +848,11 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX6-LABEL: local_singlethread_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -962,8 +954,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1069,8 +1060,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1176,8 +1166,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX6-LABEL: local_singlethread_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1283,8 +1272,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; 
GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1390,8 +1378,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1497,8 +1484,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1627,8 +1613,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1757,8 +1742,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -5817,8 +5801,7 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX6-LABEL: local_singlethread_one_as_unordered_load: ; 
GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -5947,8 +5930,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX6-LABEL: local_singlethread_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6077,8 +6059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX6-LABEL: local_singlethread_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6207,8 +6188,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6337,12 +6317,11 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX6-LABEL: local_singlethread_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: 
ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6444,12 +6423,11 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX6-LABEL: local_singlethread_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6551,12 +6529,11 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX6-LABEL: local_singlethread_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6658,12 +6635,11 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6765,8 +6741,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: 
s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6872,8 +6847,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6979,8 +6953,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7086,8 +7059,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7193,8 +7165,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7300,8 +7271,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: ; GFX6: ; 
%bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7430,8 +7400,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7560,8 +7529,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -14,8 +14,7 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +143,7 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX6-LABEL: local_system_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -274,8 +272,7 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX6-LABEL: local_system_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -408,8 +405,7 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX6-LABEL: local_system_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -555,12 +551,11 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX6-LABEL: local_system_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -662,12 +657,11 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX6-LABEL: local_system_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -769,12 +763,11 @@ define amdgpu_kernel void 
@local_system_release_store( ; GFX6-LABEL: local_system_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -889,12 +882,11 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX6-LABEL: local_system_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1009,8 +1001,7 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX6-LABEL: local_system_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1116,8 +1107,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX6-LABEL: local_system_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1236,8 +1226,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX6-LABEL: 
local_system_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1356,8 +1345,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1489,8 +1477,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1622,8 +1609,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1756,8 +1742,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1903,8 +1888,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw: ; GFX6: ; 
%bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6487,8 +6471,7 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX6-LABEL: local_system_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6617,8 +6600,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX6-LABEL: local_system_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6747,8 +6729,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX6-LABEL: local_system_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6877,8 +6858,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX6-LABEL: local_system_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7007,12 +6987,11 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX6-LABEL: local_system_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: 
s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7114,12 +7093,11 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX6-LABEL: local_system_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7221,12 +7199,11 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX6-LABEL: local_system_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7328,12 +7305,11 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX6-LABEL: local_system_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 
; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7435,8 +7411,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7542,8 +7517,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7649,8 +7623,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX6-LABEL: local_system_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7756,8 +7729,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7863,8 +7835,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7970,8 +7941,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8100,8 +8070,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8230,8 +8199,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -418,14 +418,13 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX6-LABEL: local_volatile_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; 
GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v1, v0 ; GFX6-NEXT: s_endpgm ; @@ -506,11 +505,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX6-LABEL: local_volatile_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s2, s[0:1], 0xa -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -14,8 +14,7 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +143,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX6-LABEL: local_wavefront_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -274,8 +272,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX6-LABEL: local_wavefront_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -404,8 +401,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX6-LABEL: local_wavefront_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -534,12 +530,11 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX6-LABEL: local_wavefront_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -641,12 +636,11 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX6-LABEL: local_wavefront_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -748,12 +742,11 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX6-LABEL: local_wavefront_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -855,12 +848,11 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX6-LABEL: local_wavefront_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -962,8 +954,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1069,8 +1060,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1176,8 +1166,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX6-LABEL: local_wavefront_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1283,8 +1272,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; 
GFX6-LABEL: local_wavefront_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1390,8 +1378,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1497,8 +1484,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1627,8 +1613,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1757,8 +1742,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -5817,8 +5801,7 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX6-LABEL: 
local_wavefront_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -5947,8 +5930,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX6-LABEL: local_wavefront_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6077,8 +6059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX6-LABEL: local_wavefront_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6207,8 +6188,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6337,12 +6317,11 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX6-LABEL: local_wavefront_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 
; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6444,12 +6423,11 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX6-LABEL: local_wavefront_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6551,12 +6529,11 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX6-LABEL: local_wavefront_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6658,12 +6635,11 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -6765,8 +6741,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword 
s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6872,8 +6847,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6979,8 +6953,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7086,8 +7059,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7193,8 +7165,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7300,8 +7271,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, 
s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7430,8 +7400,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7560,8 +7529,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -14,8 +14,7 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -144,8 +143,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX6-LABEL: local_workgroup_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 
; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -274,8 +272,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX6-LABEL: local_workgroup_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -408,8 +405,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX6-LABEL: local_workgroup_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -555,12 +551,11 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX6-LABEL: local_workgroup_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -662,12 +657,11 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX6-LABEL: local_workgroup_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -769,12 +763,11 @@ define amdgpu_kernel void 
@local_workgroup_release_store( ; GFX6-LABEL: local_workgroup_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -889,12 +882,11 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX6-LABEL: local_workgroup_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm @@ -1009,8 +1001,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1116,8 +1107,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1236,8 +1226,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX6-LABEL: 
local_workgroup_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1356,8 +1345,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1489,8 +1477,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1622,8 +1609,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1756,8 +1742,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -1903,8 +1888,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX6-LABEL: 
local_workgroup_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6487,8 +6471,7 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX6-LABEL: local_workgroup_one_as_unordered_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6617,8 +6600,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX6-LABEL: local_workgroup_one_as_monotonic_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6747,8 +6729,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX6-LABEL: local_workgroup_one_as_acquire_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -6877,8 +6858,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7007,12 +6987,11 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX6-LABEL: 
local_workgroup_one_as_unordered_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7114,12 +7093,11 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX6-LABEL: local_workgroup_one_as_monotonic_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7221,12 +7199,11 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX6-LABEL: local_workgroup_one_as_release_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7328,12 +7305,11 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x1 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -7435,8 +7411,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7542,8 +7517,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7649,8 +7623,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7756,8 +7729,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7863,8 +7835,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 
0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -7970,8 +7941,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8100,8 +8070,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -8230,8 +8199,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX6-NEXT: s_load_dword s1, s[4:5], 0x1 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,43 +5,41 @@ define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[4:5] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[4:5] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 +; GCN-SCRATCH-NEXT: 
global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(3) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -71,11 +69,10 @@ define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x2c +; GCN-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -93,20 +90,18 @@ ; GCN-NEXT: v_mov_b32_e32 v13, s13 ; GCN-NEXT: v_mov_b32_e32 v14, s14 ; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[18:19] 
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[18:19] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[18:19] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[18:19] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: scalar_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_clause 0x1 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-SCRATCH-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x2c +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[2:3], 0x0 +; GCN-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s0 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s1 @@ -124,10 +119,10 @@ ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v13, s13 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v14, s14 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v15, s15 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[18:19] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[18:19] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[18:19] offset:32 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[18:19] offset:48 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -107,8 +107,7 @@ } ; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: -; GCN: s_load_dwordx2 s -; GCN: s_load_dwordx2 s +; GCN: s_load_dwordx4 s ; CI: s_ashr_i32 ; CI: s_sext_i32_i16 diff --git 
a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -9,39 +9,42 @@ define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_i32 s2, s4, 0x180000 -; SI-NEXT: s_bfe_i32 s4, s5, 0x180000 -; SI-NEXT: s_mul_i32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_bfe_i32 s2, s2, 0x180000 +; SI-NEXT: s_bfe_i32 s3, s3, 0x180000 +; SI-NEXT: s_mul_i32 s2, s2, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; VI-NEXT: s_bfe_i32 s5, s5, 0x180000 -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_bfe_i32 s0, s2, 0x180000 +; VI-NEXT: s_bfe_i32 s1, s3, 0x180000 +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 ; GFX9-NEXT: s_mul_i32 s0, s0, s1 @@ -97,35 +100,38 @@ define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: test_smulhi24_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s5 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s4, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 ; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 ; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -9,39 +9,42 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; SI-LABEL: test_umul24_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s4, 0xffffff -; SI-NEXT: s_and_b32 s4, s5, 0xffffff -; SI-NEXT: s_mul_i32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_and_b32 s2, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s3, 0xffffff +; SI-NEXT: s_mul_i32 s2, s2, s3 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, 0xffffff -; VI-NEXT: s_and_b32 s5, s5, 0xffffff -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, 
s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_and_b32 s0, s2, 0xffffff +; VI-NEXT: s_and_b32 s1, s3, 0xffffff +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff ; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 @@ -376,35 +379,38 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi24_i32_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, 
s5 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s4, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff ; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff ; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 @@ -628,45 +634,46 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { ; SI-LABEL: test_umulhi16_i32: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s4, 0xffff -; SI-NEXT: s_and_b32 s4, s5, 0xffff -; SI-NEXT: s_mul_i32 s2, s2, s4 -; SI-NEXT: s_lshr_b32 s4, s2, 16 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_and_b32 s3, s3, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s3 +; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_and_b32 s0, s2, 0xffff +; VI-NEXT: s_and_b32 s1, s3, 0xffff +; VI-NEXT: s_mul_i32 s0, s0, s1 +; VI-NEXT: s_lshr_b32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_mul_i32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -59,7 +59,8 @@ ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm -define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { +; Note: dummy argument here to prevent combining of descriptor loads for %out and %b +define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, i32 %dummy, <2 x i32> %b) { %add = add <2 x i32> 
%b, store <2 x i32> %add, <2 x i32> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll @@ -7,70 +7,66 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, <8 x i16>* %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8i16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_pack_lh_b32_b16 s4, s0, s0 -; GFX900-NEXT: v_mov_b32_e32 v6, s3 -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, s1 -; GFX900-NEXT: v_mov_b32_e32 v4, s0 -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: v_mov_b32_e32 v3, s4 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX900-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX900-NEXT: v_mov_b32_e32 v5, s3 +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s0 +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v2, s4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX900-NEXT: s_endpgm ; ; GFX906-LABEL: scalar_to_vector_v8i16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: s_pack_lh_b32_b16 s4, s0, s0 -; GFX906-NEXT: v_mov_b32_e32 v6, s3 -; GFX906-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, 
s1 -; GFX906-NEXT: v_mov_b32_e32 v4, s0 -; GFX906-NEXT: v_mov_b32_e32 v1, s4 -; GFX906-NEXT: v_mov_b32_e32 v3, s4 -; GFX906-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX906-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX906-NEXT: v_mov_b32_e32 v5, s3 +; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX906-NEXT: v_mov_b32_e32 v1, s1 +; GFX906-NEXT: v_mov_b32_e32 v3, s0 +; GFX906-NEXT: v_mov_b32_e32 v0, s4 +; GFX906-NEXT: v_mov_b32_e32 v2, s4 +; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX906-NEXT: s_endpgm ; ; GFX908-LABEL: scalar_to_vector_v8i16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_pack_lh_b32_b16 s4, s0, s0 -; GFX908-NEXT: v_mov_b32_e32 v6, s3 -; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s1 -; GFX908-NEXT: v_mov_b32_e32 v4, s0 -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: v_mov_b32_e32 v3, s4 -; GFX908-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX908-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX908-NEXT: v_mov_b32_e32 v1, s1 +; GFX908-NEXT: v_mov_b32_e32 v3, s0 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v2, s4 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: scalar_to_vector_v8i16: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_pack_lh_b32_b16 s4, s0, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-NEXT: v_mov_b32_e32 v5, s0 +; GFX90A-NEXT: v_mov_b32_e32 v5, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 -; GFX90A-NEXT: v_mov_b32_e32 v4, s4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX90A-NEXT: s_endpgm entry: %val.1.i32 = extractelement <2 x i32> %in, i64 0 @@ -93,66 +89,62 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %out) #0 { ; GFX900-LABEL: scalar_to_vector_v8f16: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX900-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s0 -; GFX900-NEXT: v_mov_b32_e32 v6, s3 -; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, s1 -; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: v_mov_b32_e32 v5, s3 +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX900-NEXT: v_mov_b32_e32 v0, s0 +; GFX900-NEXT: v_mov_b32_e32 v1, s1 ; GFX900-NEXT: v_mov_b32_e32 v3, s0 -; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX900-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX900-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX900-NEXT: s_endpgm ; ; GFX906-LABEL: scalar_to_vector_v8f16: ; GFX906: ; %bb.0: ; %entry -; GFX906-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX906-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v1, s0 -; GFX906-NEXT: v_mov_b32_e32 v6, s3 -; GFX906-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX906-NEXT: v_mov_b32_e32 v2, s1 -; GFX906-NEXT: v_mov_b32_e32 v4, s0 +; GFX906-NEXT: v_mov_b32_e32 v5, s3 +; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX906-NEXT: v_mov_b32_e32 v0, s0 +; GFX906-NEXT: v_mov_b32_e32 v1, s1 ; GFX906-NEXT: v_mov_b32_e32 v3, s0 -; GFX906-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX906-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX906-NEXT: v_mov_b32_e32 v2, s0 +; GFX906-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX906-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX906-NEXT: s_endpgm ; ; GFX908-LABEL: scalar_to_vector_v8f16: ; GFX908: ; %bb.0: ; %entry -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s0 -; GFX908-NEXT: v_mov_b32_e32 v6, s3 -; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, s1 -; GFX908-NEXT: v_mov_b32_e32 v4, s0 +; GFX908-NEXT: v_mov_b32_e32 v5, s3 +; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX908-NEXT: v_mov_b32_e32 v0, s0 +; GFX908-NEXT: v_mov_b32_e32 v1, s1 ; GFX908-NEXT: v_mov_b32_e32 v3, s0 -; GFX908-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX908-NEXT: flat_store_dwordx4 v[5:6], v[1:4] +; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX908-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: scalar_to_vector_v8f16: ; GFX90A: ; %bb.0: ; 
%entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v5, s3 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: v_mov_b32_e32 v1, s3 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, s1 -; GFX90A-NEXT: v_mov_b32_e32 v5, s0 -; GFX90A-NEXT: v_mov_b32_e32 v4, s0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX90A-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX90A-NEXT: s_endpgm entry: %val.1.float = extractelement <2 x float> %in, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -942,26 +942,25 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_sdiv24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-NEXT: s_load_dword s8, s[0:1], 0xe -; GCN-NEXT: s_load_dword s0, s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_sext_i32_i16 s1, s3 -; GCN-NEXT: s_sext_i32_i16 s3, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: 
s_sext_i32_i16 s5, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 24 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_sext_i32_i16 s4, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_alignbit_b32 v2, s4, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3 @@ -971,32 +970,29 @@ ; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 +; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 24 +; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[2:3], 24 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], 
s[4:5], 24 -; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] -; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_sub_u32 s12, s6, s2 -; GCN-IR-NEXT: s_subb_u32 s13, s7, s2 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s4 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 +; GCN-IR-NEXT: s_sub_u32 s12, s6, s0 +; GCN-IR-NEXT: s_subb_u32 s13, s7, s0 +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[10:11] +; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 @@ -1069,16 +1065,16 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] ; GCN-IR-NEXT: .LBB9_6: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] -; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -6,12 +6,13 @@ define 
amdgpu_kernel void @select_constant_cttz(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind { ; GCN-LABEL: select_constant_cttz: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshr_b32 s0, 1, s2 ; GCN-NEXT: s_ff1_i32_b32 s0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -79,9 +79,7 @@ } ; GCN-LABEL: {{^}}select_v2i16: -; GFX89: s_load_dword -; GFX89: s_load_dword -; GFX89: s_load_dword +; GFX89: s_load_dwordx4 ; GFX89: s_cselect_b32 ; GFX89-NOT: s_cselect_b32 diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll --- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll @@ -4,15 +4,16 @@ define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) { ; GCN-LABEL: sext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s4, s4 -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_sext_i32_i16 s0, s2 +; GCN-NEXT: s_add_i32 s0, s3, s0 +; 
GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %sext = sext i16 %a to i32 %res = add i32 %b, %sext diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,74 +8,77 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s5, 16 -; VI-NEXT: s_lshl_b32 s6, s6, s7 -; VI-NEXT: s_lshl_b32 s4, s4, s5 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_lshl_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s6, s4, 16 -; CI-NEXT: s_lshr_b32 s7, s5, 16 -; CI-NEXT: s_lshl_b32 s6, s6, s7 -; CI-NEXT: s_lshl_b32 s4, s4, s5 -; CI-NEXT: s_lshl_b32 s6, s6, 16 -; CI-NEXT: s_and_b32 s4, s4, 0xffff -; CI-NEXT: s_or_b32 s4, s4, s6 -; CI-NEXT: v_mov_b32_e32 v0, s4 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: s_mov_b32 s4, s0 +; CI-NEXT: s_mov_b32 s5, s1 +; CI-NEXT: s_lshr_b32 s0, s2, 16 +; CI-NEXT: s_lshr_b32 s1, s3, 16 +; CI-NEXT: s_lshl_b32 s0, s0, s1 +; CI-NEXT: s_lshl_b32 s1, s2, s3 +; CI-NEXT: s_lshl_b32 s0, s0, 16 +; CI-NEXT: s_and_b32 s1, s1, 0xffff +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, 
s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = shl <2 x i16> %lhs, %rhs diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -5,28 +5,30 @@ define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, s5 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: 
s_mov_b32 s5, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 @@ -37,14 +39,15 @@ define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: test_s_sext_i32_to_i64: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s6, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mul_i32 s4, s4, s5 -; SI-NEXT: s_add_i32 s4, s4, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mul_i32 s4, s6, s7 +; SI-NEXT: s_add_i32 s4, s4, s8 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_ashr_i32 s5, s4, 31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -53,14 +56,15 @@ ; ; VI-LABEL: test_s_sext_i32_to_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s6, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: s_add_i32 s4, s4, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mul_i32 s4, s6, s7 +; VI-NEXT: s_add_i32 s4, s4, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_ashr_i32 s5, s4, 31 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 @@ -77,30 +81,32 @@ define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; 
SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, s5 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, s5 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 @@ -215,28 +221,30 @@ define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; SI-LABEL: s_sext_i1_to_i16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s4, s5 -; SI-NEXT: 
s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_cmp_eq_u32 s2, s3 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s4, s5 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: s_cmp_eq_u32 s2, s3 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 @@ -291,14 +299,15 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { ; SI-LABEL: v_sext_i1_to_i16_with_and: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; SI-NEXT: s_load_dword s6, s[0:1], 0xd -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_load_dword s8, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; SI-NEXT: s_cmp_eq_u32 s5, s6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_cmp_eq_u32 s7, s8 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, -1, s[4:5] @@ -307,14 +316,15 @@ ; ; VI-LABEL: v_sext_i1_to_i16_with_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dword s6, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dword s8, s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; VI-NEXT: s_cmp_eq_u32 s5, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_cmp_eq_u32 s7, s8 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -106,11 +106,11 @@ } ; GCN-LABEL: {{^}}s_abs_v4i16: -; GFX9: s_load_dwordx2 s[[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]], s[0:1], 0x2c -; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]] -; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]] -; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]] -; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]] +; GFX9: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s[0:1], 0x24 +; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[#LOAD + 2]] +; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[#LOAD + 3]] +; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[#LOAD + 2]], [[SUB0]] +; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[#LOAD + 3]], [[SUB1]] ; GFX9-DAG: v_pk_sub_u16 [[ADD0:v[0-9]+]], [[MAX0]], -2 op_sel_hi:[1,0] ; GFX9-DAG: v_pk_sub_u16 [[ADD1:v[0-9]+]], [[MAX1]], -2 op_sel_hi:[1,0] define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- 
a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1106,19 +1106,18 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xc -; GCN-NEXT: s_load_dword s3, s[0:1], 0xe -; GCN-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s1, s2 -; GCN-NEXT: s_sext_i32_i16 s2, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_alignbit_b32 v0, s2, v0, 24 +; GCN-NEXT: s_sext_i32_i16 s7, s7 +; GCN-NEXT: s_sext_i32_i16 s1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 24 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_alignbit_b32 v2, s7, v2, 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 @@ -1130,50 +1129,47 @@ ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem24_48: ; 
GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s7, s7 ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 -; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 -; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 -; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s4, s4, s2 -; GCN-IR-NEXT: s_subb_u32 s5, s5, s2 -; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 +; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 +; GCN-IR-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[6:7], 24 +; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[2:3], 24 +; GCN-IR-NEXT: s_mov_b32 s1, s0 +; GCN-IR-NEXT: s_mov_b32 s13, s12 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[0:1] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[10:11], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s12 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s12 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 ; GCN-IR-NEXT: s_min_u32 s12, s10, s11 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 ; GCN-IR-NEXT: 
s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 ; GCN-IR-NEXT: s_min_u32 s16, s10, s11 ; GCN-IR-NEXT: s_sub_u32 s10, s12, s16 ; GCN-IR-NEXT: s_subb_u32 s11, 0, 0 @@ -1191,10 +1187,10 @@ ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 +; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] @@ -1230,29 +1226,29 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_branch .LBB9_6 ; GCN-IR-NEXT: .LBB9_5: -; GCN-IR-NEXT: v_mov_b32_e32 v0, s5 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[14:15] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB9_6: ; %udiv-end ; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 ; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 +; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 +; GCN-IR-NEXT: 
v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -56,19 +56,18 @@ ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 ; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 +; HAWAII-NEXT: v_mov_b32_e32 v3, s0 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 ; HAWAII-NEXT: s_waitcnt vmcnt(0) ; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b32 v1, v2 +; HAWAII-NEXT: ds_write_b32 v1, v3 ; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: @@ -77,15 +76,14 @@ ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0xc -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 
0x0 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_and_b32 s3, s0, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: v_mov_b32_e32 v2, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s2 +; FIJI-NEXT: s_and_b32 s3, s1, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: v_mov_b32_e32 v2, s1 +; FIJI-NEXT: v_mov_b32_e32 v3, s0 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -99,14 +97,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0xc -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s3, s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_and_b32 s3, s1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b16 v1, v2 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, s3, v0 @@ -118,16 +115,15 @@ ; GFX10-LABEL: local_store_i55: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s0, s[4:5], 0xc -; GFX10-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-NEXT: global_load_ubyte_d16_hi v0, v0, s[4:5] offset:14 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s3, s0, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s2 +; GFX10-NEXT: s_and_b32 s3, s1, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: v_mov_b32_e32 
v2, s1 +; GFX10-NEXT: v_mov_b32_e32 v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 @@ -140,14 +136,13 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[0:1] offset:14 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0xc -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s2, 0xffff -; GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s0 +; GFX11-NEXT: s_and_b32 s1, s3, 0xffff +; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s3 +; GFX11-NEXT: v_mov_b32_e32 v3, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -163,67 +158,62 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x3 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v0, s2 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s0 ; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b32 v0, v1 +; HAWAII-NEXT: ds_write_b32 v0, v2 ; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; 
FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v0, s2 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: v_mov_b32_e32 v2, s0 ; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: ds_write_b32 v0, v2 ; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0xc -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: local_store_i48: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-NEXT: s_load_dword s1, s[4:5], 0xc -; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX10-NEXT: ds_write_b32 v0, v2 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b32 s3, s[0:1], 0xc -; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -5,9 +5,8 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone speculatable ; GCN-LABEL: {{^}}s_sub_i32: -; GCN: s_load_dwordx2 s[[[A:[0-9]+]]:[[B:[0-9]+]]] -; GCN: s_load_dwordx2 -; GCN: s_sub_i32 s{{[0-9]+}}, s[[A]], s[[B]] +; GCN: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}] +; GCN: s_sub_i32 s{{[0-9]+}}, s[[#LOAD + 2]], s[[#LOAD + 3]] define amdgpu_kernel void @s_sub_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = sub i32 %a, %b store i32 %result, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -211,56 +211,58 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; 
VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s6, s4, 16 -; VI-NEXT: s_lshr_b32 s7, s5, 16 -; VI-NEXT: s_sub_i32 s6, s6, s7 -; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_lshl_b32 s5, s6, 16 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_lshr_b32 s0, s2, 16 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: s_sub_i32 s0, s0, s1 +; VI-NEXT: s_sub_i32 s1, s2, s3 +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_and_b32 s1, s1, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_mov_b32 s5, s1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %add = sub <2 x i16> %a, %b diff --git 
a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -7,27 +7,28 @@ define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { ; CHECK-LABEL: foobar: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s6, -1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CHECK-NEXT: ; %bb.1: ; %ift -; CHECK-NEXT: s_mov_b32 s4, s5 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_mov_b32 s0, s1 +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: ; %bb.2: ; %ife -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s4, s2 +; CHECK-NEXT: s_mov_b32 s5, s3 +; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; CHECK-NEXT: s_endpgm ; FIXME: The change related to the fact that diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll 
b/llvm/test/CodeGen/AMDGPU/udiv.ll --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -181,17 +181,18 @@ define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; SI-LABEL: s_udiv_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v0, s3 ; SI-NEXT: s_sub_i32 s4, 0, s3 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s4, v0 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: v_mul_hi_u32 v1, v0, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; SI-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -205,23 +206,23 @@ ; SI-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; SI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 ; VI-NEXT: s_sub_i32 s4, 0, s3 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s4, v0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_mul_hi_u32 v0, s2, v0 @@ -235,33 +236,31 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: s_udiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GCN-NEXT: s_sub_i32 s0, 0, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GCN-NEXT: s_sub_i32 s0, 0, s7 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s7 ; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_sub_u32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 +; GCN-NEXT: v_sub_u32_e32 v1, vcc, s6, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s3, v1 +; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s7, v1 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GCN-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 @@ -270,35 +269,33 @@ ; ; GFX1030-LABEL: s_udiv_i32: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX1030-NEXT: s_sub_i32 s3, 0, s1 +; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX1030-NEXT: s_sub_i32 s5, 0, s3 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 -; 
GFX1030-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1030-NEXT: s_mul_i32 s3, s3, s2 -; GFX1030-NEXT: s_mul_hi_u32 s3, s2, s3 -; GFX1030-NEXT: s_add_i32 s2, s2, s3 -; GFX1030-NEXT: s_mul_hi_u32 s6, s0, s2 -; GFX1030-NEXT: s_mul_i32 s2, s6, s1 -; GFX1030-NEXT: s_sub_i32 s0, s0, s2 -; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 -; GFX1030-NEXT: s_cmp_ge_u32 s0, s1 +; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1030-NEXT: s_mul_i32 s5, s5, s4 +; GFX1030-NEXT: s_mul_hi_u32 s5, s4, s5 +; GFX1030-NEXT: s_add_i32 s4, s4, s5 +; GFX1030-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX1030-NEXT: s_mul_i32 s5, s4, s3 +; GFX1030-NEXT: s_sub_i32 s2, s2, s5 +; GFX1030-NEXT: s_cmp_ge_u32 s2, s3 ; GFX1030-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1030-NEXT: s_add_i32 s7, s6, 1 -; GFX1030-NEXT: s_sub_i32 s4, s0, s1 -; GFX1030-NEXT: v_mov_b32_e32 v0, s7 -; GFX1030-NEXT: v_mov_b32_e32 v1, s4 -; GFX1030-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo -; GFX1030-NEXT: v_cndmask_b32_e32 v1, s0, v1, vcc_lo +; GFX1030-NEXT: s_add_i32 s5, s4, 1 +; GFX1030-NEXT: v_mov_b32_e32 v0, s5 +; GFX1030-NEXT: s_sub_i32 s5, s2, s3 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_cndmask_b32_e32 v0, s4, v0, vcc_lo +; GFX1030-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo ; GFX1030-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v1 +; GFX1030-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v1 ; GFX1030-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: global_store_dword v3, v0, s[2:3] +; GFX1030-NEXT: global_store_dword v3, v0, s[0:1] ; GFX1030-NEXT: s_endpgm ; ; EG-LABEL: s_udiv_i32: diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -672,38 +672,36 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_udiv24_i48: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; 
GCN-NEXT: s_load_dword s4, s[0:1], 0xd -; GCN-NEXT: s_load_dword s6, s[0:1], 0xc +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s2, 0xffff -; GCN-NEXT: s_and_b32 s2, s4, 0xff000000 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 +; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_and_b32 s8, s6, 0xffff -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_and_b32 s8, s3, 0xffff +; GCN-NEXT: s_and_b32 s9, s2, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 ; GCN-NEXT: v_mac_f32_e32 v1, 0, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s9, s0, 0xff000000 -; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 -; GCN-NEXT: s_sub_u32 s0, 0, s0 +; GCN-NEXT: s_sub_u32 s2, 0, s2 +; GCN-NEXT: s_subb_u32 s3, 0, s3 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_subb_u32 s1, 0, s1 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 -; GCN-NEXT: v_mul_lo_u32 v6, s0, v1 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s2, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 @@ -722,11 
+720,11 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s3, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 @@ -789,25 +787,22 @@ ; ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb -; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s3, s2, 0xffff -; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s3, s7, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s6, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-IR-NEXT: s_and_b32 s0, s0, 0xff000000 ; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 -; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[0:1], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_mov_b64 s[0:1], 0 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s4 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s5 +; GCN-IR-NEXT: 
s_flbit_i32_b32 s7, s3 ; GCN-IR-NEXT: s_min_u32 s10, s6, s7 ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 @@ -833,39 +828,39 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s12 -; GCN-IR-NEXT: s_add_u32 s15, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s2, s14 -; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 +; GCN-IR-NEXT: s_add_u32 s15, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s3, -1 +; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] +; GCN-IR-NEXT: s_add_u32 s8, s0, s14 +; GCN-IR-NEXT: s_addc_u32 s9, s1, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s1, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s0, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s15, s12 -; GCN-IR-NEXT: s_subb_u32 s2, s16, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s0, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s0, s16, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s0, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s2, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] +; GCN-IR-NEXT: s_and_b32 s0, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: 
s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 ; GCN-IR-NEXT: s_branch .LBB7_6 ; GCN-IR-NEXT: .LBB7_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 @@ -873,10 +868,10 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] ; GCN-IR-NEXT: .LBB7_6: ; %udiv-end -; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -28,10 +28,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b: -; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], s[[SGPR0]], [[VGPR1]] +; SI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[#LOAD + 2]], s[[#LOAD + 2]], [[VGPR1]] ; GCN: 
buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 @@ -40,7 +40,7 @@ } ; GCN-LABEL: {{^}}test_use_s_v_s: -; SI: s_load_dwordx2 s[[[SA:[0-9]+]]:[[SB:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; SI: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} ; SI: buffer_load_dword [[VA0:v[0-9]+]] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] @@ -52,14 +52,14 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_load_dword [[VA1:v[0-9]+]] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI: s_load_dwordx2 s[[[SA:[0-9]+]]:[[SB:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; VI: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} ; GCN-NOT: v_mov_b32 -; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[SB]] +; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], s[[#LOAD + 3]] ; GCN-NOT: v_mov_b32 -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SA]], [[VA0]], [[VB]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SA]], [[VA1]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[#LOAD + 2]], [[VA0]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[#LOAD + 2]], [[VA1]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { @@ -73,10 +73,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a: -; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[SGPR0]], [[VGPR1]], s[[SGPR0]] +; SI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], 
s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], s[[#LOAD + 2]], [[VGPR1]], s[[#LOAD + 2]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 @@ -85,10 +85,10 @@ } ; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a: -; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[SGPR0]], s[[SGPR0]] +; SI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x24 +; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], s[[#LOAD + 2]], s[[#LOAD + 2]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 @@ -151,9 +151,9 @@ } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_k_s_x2: -; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] -; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[#LOAD + 2]] +; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SK]], [[SK]], [[VGPR0]] ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SK]], [[SK]], [[VGPR1]] @@ -181,9 +181,9 @@ 
} ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_k_s_k_x2: -; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] -; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[#LOAD + 2]] +; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]] ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]] @@ -211,9 +211,9 @@ } ; GCN-LABEL: {{^}}test_literal_use_twice_ternary_op_s_k_k_x2: -; GCN-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]{{\:}}[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[SGPR0]] -; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[SGPR1]] +; GCN-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}} +; GCN-DAG: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s[[#LOAD + 2]] +; GCN-DAG: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s[[#LOAD + 3]] ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x44800000 ; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VGPR0]], [[SK]], [[SK]] ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VGPR1]], [[SK]], [[SK]] @@ -229,14 +229,14 @@ } ; GCN-LABEL: {{^}}test_s0_s1_k_f32: -; SI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb -; VI-DAG: s_load_dwordx2 s[[[SGPR0:[0-9]+]]:[[SGPR1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c +; SI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x9 +; VI-DAG: s_load_dwordx4 s[[[#LOAD:]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 -; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[SGPR1]] +; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], s[[#LOAD + 3]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[SGPR0]], 
[[VS1]], [[VK0]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], s[[#LOAD + 2]], [[VS1]], [[VK0]] ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[SGPR0]], [[VS1]], [[VK1]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], s[[#LOAD + 2]], [[VS1]], [[VK1]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -5,8 +5,7 @@ ; The ilpmax scheduler is used for the second test to get the ordering we want for the test. ; DEFAULT-LABEL: {{^}}main: -; DEFAULT: s_load_dwordx4 -; DEFAULT: s_load_dwordx4 +; DEFAULT: s_load_dwordx8 ; DEFAULT: s_waitcnt lgkmcnt(0) ; DEFAULT: buffer_load_format_xyzw ; DEFAULT: buffer_load_format_xyzw @@ -39,11 +38,9 @@ } ; ILPMAX-LABEL: {{^}}main2: -; ILPMAX: s_load_dwordx4 +; ILPMAX: s_load_dwordx8 ; ILPMAX: s_waitcnt lgkmcnt(0) ; ILPMAX: buffer_load -; ILPMAX: s_load_dwordx4 -; ILPMAX: s_waitcnt lgkmcnt(0) ; ILPMAX: buffer_load ; ILPMAX: s_waitcnt vmcnt(0) ; ILPMAX: exp pos0 diff --git a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll --- a/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll @@ -4,15 +4,16 @@ define amdgpu_kernel void @zext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) { ; GCN-LABEL: zext_i16_to_i32_uniform: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s4, s4, 0xffff -; GCN-NEXT: s_add_i32 s4, s5, s4 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_and_b32 s0, s2, 0xffff +; GCN-NEXT: s_add_i32 s0, s3, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %zext = zext i16 %a to i32 %res = add i32 %b, %zext