diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -355,18 +355,12 @@ } unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || - AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT || - AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER || - AddrSpace == AMDGPUAS::BUFFER_RESOURCE) { + if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS) return 512; - } if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) return 8 * ST->getMaxPrivateElementSize(); - // Common to flat, global, local and region. Assume for unknown addrspace. return 128; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -851,140 +851,140 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: sdivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s12, 31 -; GFX8-NEXT: s_add_i32 s0, s12, s2 -; GFX8-NEXT: s_xor_b32 s3, s0, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 -; GFX8-NEXT: s_ashr_i32 s16, s13, 31 -; GFX8-NEXT: s_add_i32 s0, s13, s16 +; GFX8-NEXT: s_ashr_i32 s12, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s12 +; GFX8-NEXT: s_xor_b32 s13, s0, s12 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GFX8-NEXT: s_ashr_i32 s15, s1, 31 +; GFX8-NEXT: s_add_i32 s0, s1, s15 +; GFX8-NEXT: s_sub_i32 s1, 0, s13 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_xor_b32 s13, s0, s16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX8-NEXT: s_ashr_i32 s12, s8, 31 +; GFX8-NEXT: s_xor_b32 s16, s0, s15 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s16 +; GFX8-NEXT: s_ashr_i32 s14, s8, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s0, s8, s12 -; GFX8-NEXT: s_xor_b32 s0, s0, s12 +; GFX8-NEXT: s_add_i32 s0, s8, s14 +; GFX8-NEXT: s_xor_b32 s0, s0, s14 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 -; GFX8-NEXT: s_sub_i32 s8, 0, s13 +; GFX8-NEXT: s_sub_i32 s8, 0, s16 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s13 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s13, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX8-NEXT: s_xor_b32 s0, s12, s2 -; GFX8-NEXT: s_ashr_i32 s2, s9, 31 -; GFX8-NEXT: s_add_i32 s1, s9, s2 +; GFX8-NEXT: s_ashr_i32 s8, s9, 31 +; GFX8-NEXT: s_add_i32 s1, s9, s8 +; GFX8-NEXT: s_xor_b32 s1, s1, s8 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 +; GFX8-NEXT: s_xor_b32 s0, s14, s12 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: s_ashr_i32 s3, s14, 31 +; GFX8-NEXT: s_ashr_i32 s9, s2, 31 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s13 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s12, v2 -; GFX8-NEXT: s_add_i32 s0, s14, s3 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s16 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s14, v2 +; GFX8-NEXT: s_add_i32 s0, s2, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v3 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 -; GFX8-NEXT: s_xor_b32 s8, s0, s3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 +; GFX8-NEXT: s_xor_b32 s2, s0, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s13, v2 -; GFX8-NEXT: s_sub_i32 s0, 0, s8 +; GFX8-NEXT: v_subrev_u32_e64 v5, s[0:1], s16, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX8-NEXT: v_mul_lo_u32 v5, s0, v3 -; GFX8-NEXT: s_ashr_i32 s9, s10, 31 -; GFX8-NEXT: s_add_i32 s1, s10, s9 -; GFX8-NEXT: s_xor_b32 s1, s1, s9 +; GFX8-NEXT: s_ashr_i32 s12, s10, 31 +; GFX8-NEXT: s_add_i32 s1, s10, s12 +; GFX8-NEXT: s_xor_b32 s1, s1, s12 ; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX8-NEXT: s_xor_b32 s0, s2, s16 -; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX8-NEXT: s_xor_b32 s0, s8, s15 +; GFX8-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, s1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s2, v2 -; GFX8-NEXT: s_ashr_i32 s2, s15, 31 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, s8 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v2 +; GFX8-NEXT: s_ashr_i32 s8, s3, 31 +; GFX8-NEXT: v_mul_lo_u32 v6, v3, s2 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: s_add_i32 s0, s15, s2 +; GFX8-NEXT: s_add_i32 s0, s3, s8 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s1, v6 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX8-NEXT: s_xor_b32 s10, s0, s2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX8-NEXT: s_xor_b32 s3, s0, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s10 -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 +; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s3 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; GFX8-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s8, v2 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s2, v2 +; GFX8-NEXT: s_sub_i32 s0, 0, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v7, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v6 -; GFX8-NEXT: s_xor_b32 s0, s9, s3 -; GFX8-NEXT: s_ashr_i32 s3, s11, 31 -; GFX8-NEXT: s_add_i32 s1, s11, s3 +; GFX8-NEXT: s_ashr_i32 s2, s11, 31 +; GFX8-NEXT: s_add_i32 s1, s11, s2 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v2, v6, v2 -; GFX8-NEXT: s_xor_b32 s1, s1, s3 +; GFX8-NEXT: s_xor_b32 s0, s12, s9 ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GFX8-NEXT: v_mul_hi_u32 v8, s1, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v3 -; GFX8-NEXT: v_xor_b32_e32 v3, s9, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v8, s10 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s9, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, s12, v7 +; GFX8-NEXT: v_mul_lo_u32 v7, v8, s3 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s12, v3 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v7 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v8 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 1, v7 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s10, v3 +; GFX8-NEXT: v_subrev_u32_e64 v8, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc -; GFX8-NEXT: s_xor_b32 s0, s3, s2 +; GFX8-NEXT: s_xor_b32 s0, s2, s8 ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v7 -; GFX8-NEXT: v_xor_b32_e32 v7, s3, v8 +; GFX8-NEXT: v_xor_b32_e32 v7, s2, v8 ; GFX8-NEXT: v_mov_b32_e32 v9, s5 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 ; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s3, v7 +; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v7 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] @@ -992,235 +992,236 @@ ; ; GFX9-LABEL: sdivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s6, s12, 31 -; GFX9-NEXT: s_add_i32 s0, s12, s6 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_ashr_i32 s4, s13, 31 -; GFX9-NEXT: s_add_i32 s5, s13, s4 +; GFX9-NEXT: s_ashr_i32 s12, s8, 31 +; GFX9-NEXT: s_add_i32 s0, s8, s12 +; GFX9-NEXT: s_xor_b32 s8, s0, s12 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_ashr_i32 s13, s9, 31 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_xor_b32 s9, s9, s13 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s5, s5, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_sub_i32 s13, 0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_i32 s15, 0, s8 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_ashr_i32 s12, s8, 31 -; GFX9-NEXT: s_add_i32 s8, s8, s12 -; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s14, s4, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s15, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_xor_b32 s8, s8, s12 +; GFX9-NEXT: s_add_i32 s4, s4, s14 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_sub_i32 s13, 0, s5 -; GFX9-NEXT: v_mul_lo_u32 v3, s13, v1 -; GFX9-NEXT: s_ashr_i32 s13, s9, 31 +; GFX9-NEXT: s_xor_b32 s4, s4, s14 +; GFX9-NEXT: s_sub_i32 s15, 0, s9 +; GFX9-NEXT: v_mul_lo_u32 v3, s15, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: s_ashr_i32 s15, s5, 31 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_add_i32 s9, s9, s13 -; GFX9-NEXT: s_xor_b32 s9, s9, s13 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: s_add_i32 s5, s5, s15 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX9-NEXT: s_xor_b32 s5, s5, s15 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v2 -; GFX9-NEXT: s_xor_b32 s6, s12, s6 +; GFX9-NEXT: v_subrev_u32_e32 v3, s8, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: s_ashr_i32 s6, s14, 31 -; GFX9-NEXT: s_add_i32 s7, s14, s6 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX9-NEXT: s_xor_b32 s7, s7, s6 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s9 +; GFX9-NEXT: s_xor_b32 s4, s14, s12 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 +; GFX9-NEXT: s_ashr_i32 s4, s10, 31 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: s_add_i32 s5, s10, s4 +; GFX9-NEXT: s_xor_b32 s5, s5, s4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s5, v2 +; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 -; GFX9-NEXT: s_sub_i32 s8, 0, s7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 +; GFX9-NEXT: s_sub_i32 s8, 0, s5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s8, v3 -; GFX9-NEXT: s_xor_b32 s4, s13, s4 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: s_ashr_i32 s4, s15, 31 -; GFX9-NEXT: s_add_i32 s9, s15, s4 +; GFX9-NEXT: s_xor_b32 s8, s15, s13 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_subrev_u32_e32 v1, s8, v1 +; GFX9-NEXT: s_ashr_i32 s8, s11, 31 +; GFX9-NEXT: s_add_i32 s10, s11, s8 ; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX9-NEXT: s_xor_b32 s9, s9, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s9 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 -; GFX9-NEXT: s_ashr_i32 s5, s10, 31 -; GFX9-NEXT: s_add_i32 s8, s10, s5 -; GFX9-NEXT: s_xor_b32 s8, s8, s5 +; GFX9-NEXT: s_xor_b32 s10, s10, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v2 +; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s7 +; GFX9-NEXT: v_xor_b32_e32 v2, s15, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v3, s5 ; GFX9-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v2 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v6 -; GFX9-NEXT: s_sub_i32 s8, 0, s9 -; GFX9-NEXT: v_mul_lo_u32 v8, s8, v7 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s6, v6 +; GFX9-NEXT: s_sub_i32 s6, 0, s10 +; GFX9-NEXT: v_mul_lo_u32 v8, s6, v7 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v7, v8 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v2 -; GFX9-NEXT: s_ashr_i32 s7, s11, 31 -; GFX9-NEXT: s_add_i32 s8, s11, s7 -; GFX9-NEXT: s_xor_b32 s8, s8, s7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v2 +; GFX9-NEXT: s_ashr_i32 s5, s7, 31 +; GFX9-NEXT: s_add_i32 s6, s7, s5 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 ; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v7 -; GFX9-NEXT: s_xor_b32 s6, s5, s6 +; GFX9-NEXT: v_mul_hi_u32 v7, s6, v7 +; GFX9-NEXT: s_xor_b32 s4, s9, s4 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v7, s9 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v7, s10 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: s_xor_b32 s4, s7, s4 -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: s_xor_b32 s4, s5, s8 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v7 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s9, v3 +; GFX9-NEXT: v_subrev_u32_e32 v8, s10, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v8, vcc ; GFX9-NEXT: v_xor_b32_e32 v3, s4, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, s5, v6 +; GFX9-NEXT: v_xor_b32_e32 v6, s9, v6 ; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_xor_b32_e32 v7, s7, v8 +; GFX9-NEXT: v_xor_b32_e32 v7, s5, v8 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subrev_u32_e32 v6, s5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, s7, v7 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s5, v7 ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_ashr_i32 s3, s15, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s7, s13, s1 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_add_i32 s13, s15, s3 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 -; GFX10-NEXT: s_xor_b32 s15, s7, s1 -; GFX10-NEXT: s_xor_b32 s12, s12, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX10-NEXT: s_ashr_i32 s4, s0, 31 +; GFX10-NEXT: s_ashr_i32 s5, s1, 31 +; GFX10-NEXT: s_ashr_i32 s6, s2, 31 +; GFX10-NEXT: s_ashr_i32 s7, s3, 31 +; GFX10-NEXT: s_add_i32 s0, s0, s4 +; GFX10-NEXT: s_add_i32 s1, s1, s5 +; GFX10-NEXT: s_add_i32 s2, s2, s6 +; GFX10-NEXT: s_add_i32 s3, s3, s7 +; GFX10-NEXT: s_xor_b32 s16, s0, s4 +; GFX10-NEXT: s_xor_b32 s17, s1, s5 +; GFX10-NEXT: s_xor_b32 s18, s2, s6 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s16 +; GFX10-NEXT: s_xor_b32 s3, s3, s7 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s17 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s18 +; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 +; GFX10-NEXT: s_sub_i32 s0, 0, s16 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX10-NEXT: s_sub_i32 s7, 0, s15 -; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: s_ashr_i32 s17, s9, 31 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: s_sub_i32 s1, 0, s17 +; GFX10-NEXT: s_sub_i32 s2, 0, s18 +; GFX10-NEXT: s_ashr_i32 s19, s12, 31 +; GFX10-NEXT: s_ashr_i32 s20, s13, 31 +; GFX10-NEXT: s_ashr_i32 s21, s14, 31 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_xor_b32 s20, s16, s0 +; GFX10-NEXT: s_ashr_i32 s22, s15, 31 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s21, s17, s1 +; GFX10-NEXT: s_xor_b32 s4, s19, s4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 -; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s6, s8, s16 -; GFX10-NEXT: s_add_i32 s7, s9, s17 +; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0 +; GFX10-NEXT: s_sub_i32 s0, 0, s3 +; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1 +; GFX10-NEXT: v_mul_lo_u32 v6, s2, v2 +; GFX10-NEXT: v_mul_lo_u32 v7, s0, v3 +; GFX10-NEXT: s_add_i32 s0, s12, s19 +; GFX10-NEXT: s_add_i32 s1, s13, s20 +; GFX10-NEXT: s_add_i32 s2, s14, s21 ; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 -; GFX10-NEXT: s_add_i32 s8, s10, s18 +; GFX10-NEXT: s_add_i32 s12, s15, s22 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: s_add_i32 s9, s11, s19 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: s_xor_b32 s0, s0, s19 +; GFX10-NEXT: s_xor_b32 s1, s1, s20 +; GFX10-NEXT: s_xor_b32 s2, s2, s21 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 -; GFX10-NEXT: s_xor_b32 s8, s8, s18 +; GFX10-NEXT: s_xor_b32 s12, s12, s22 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 -; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 -; GFX10-NEXT: s_xor_b32 s22, s18, s2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 -; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 +; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX10-NEXT: s_xor_b32 s5, s20, s5 +; GFX10-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX10-NEXT: v_mul_hi_u32 v2, s2, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s12, v3 +; GFX10-NEXT: s_xor_b32 s6, s21, s6 +; GFX10-NEXT: v_mul_lo_u32 v4, v0, s16 ; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX10-NEXT: v_mul_lo_u32 v5, v1, s17 +; GFX10-NEXT: v_mul_lo_u32 v6, v2, s18 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 -; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s0, v4 +; GFX10-NEXT: v_sub_nc_u32_e32 v5, s1, v5 +; GFX10-NEXT: v_sub_nc_u32_e32 v6, s2, v6 +; GFX10-NEXT: v_sub_nc_u32_e32 v7, s12, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s16, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s17, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s17, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 @@ -1229,43 +1230,42 @@ ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s16, v4 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s17, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s18, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s16, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s17, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s18, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: s_xor_b32 s0, s19, s3 +; GFX10-NEXT: s_xor_b32 s0, s22, s7 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 -; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_xor_b32_e32 v2, s6, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX10-NEXT: v_xor_b32_e32 v4, s16, v4 -; GFX10-NEXT: v_xor_b32_e32 v5, s17, v5 -; GFX10-NEXT: v_xor_b32_e32 v6, s18, v6 -; GFX10-NEXT: v_xor_b32_e32 v7, s19, v7 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s20, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s21, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s22, v2 +; GFX10-NEXT: v_xor_b32_e32 v4, s19, v4 +; GFX10-NEXT: v_xor_b32_e32 v5, s20, v5 +; GFX10-NEXT: v_xor_b32_e32 v6, s21, v6 +; GFX10-NEXT: v_xor_b32_e32 v7, s22, v7 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s16, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s17, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s18, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s19, v7 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] -; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v4, s19, v4 +; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s20, v5 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s21, v6 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s22, v7 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9] +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <4 x i32> %x, %y store <4 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -691,8 +691,8 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) { ; GFX8-LABEL: udivrem_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -789,15 +789,16 @@ ; ; GFX9-LABEL: udivrem_v4i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_i32 s0, 0, s12 -; GFX9-NEXT: s_sub_i32 s1, 0, s13 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_sub_i32 s0, 0, s8 +; GFX9-NEXT: s_sub_i32 s1, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 +; GFX9-NEXT: s_sub_i32 s12, 0, s10 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -805,88 +806,87 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_sub_i32 s4, 0, s14 +; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s12 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s13 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s9 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3 -; GFX9-NEXT: v_sub_u32_e32 v7, s9, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s5, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s12, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v7 +; GFX9-NEXT: v_mul_lo_u32 v3, s12, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; GFX9-NEXT: v_subrev_u32_e32 v6, s13, v7 +; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 -; GFX9-NEXT: v_mul_hi_u32 v2, s10, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v6 -; GFX9-NEXT: s_sub_i32 s4, 0, s15 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 +; GFX9-NEXT: s_sub_i32 s4, 0, s11 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v2, s14 +; GFX9-NEXT: v_mul_lo_u32 v7, v2, s10 ; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 -; GFX9-NEXT: v_subrev_u32_e32 v5, s13, v6 +; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, s10, v7 +; GFX9-NEXT: v_sub_u32_e32 v6, s6, v7 ; GFX9-NEXT: v_mul_hi_u32 v7, v3, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 -; GFX9-NEXT: v_mul_hi_u32 v3, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v7, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 -; GFX9-NEXT: v_mul_lo_u32 v8, v3, s15 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v6 +; GFX9-NEXT: v_mul_lo_u32 v8, v3, s11 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v7, s14, v6 +; GFX9-NEXT: v_subrev_u32_e32 v7, s10, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX9-NEXT: v_sub_u32_e32 v7, s11, v8 +; GFX9-NEXT: v_sub_u32_e32 v7, s7, v8 ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_add_u32_e32 v8, 1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; GFX9-NEXT: v_subrev_u32_e32 v8, s15, v7 +; GFX9-NEXT: v_subrev_u32_e32 v8, s11, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -104,9 +104,9 @@ ; VI-NEXT: s_lshr_b32 s3, s0, 16 ; VI-NEXT: s_add_i32 s2, s2, s0 ; VI-NEXT: s_add_i32 s1, s1, s3 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_lshl_b32 s1, s1, 16 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s0, s1, 16 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -15,9 +15,6 @@ ; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr33, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 @@ -33,7 +30,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -46,7 +43,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr20, $vgpr22 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF @@ -57,17 +54,18 @@ ; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.3.Flow17: + ; GFX90A-NEXT: bb.3.Flow15: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec @@ -115,7 +113,7 @@ ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.6.Flow20: + ; GFX90A-NEXT: bb.6.Flow18: ; GFX90A-NEXT: successors: %bb.7(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -128,7 +126,7 @@ ; GFX90A-NEXT: renamable $vgpr27 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr26 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.7.Flow19: + ; GFX90A-NEXT: bb.7.Flow17: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -136,7 +134,7 @@ ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.8.Flow32: + ; GFX90A-NEXT: bb.8.Flow30: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -153,7 +151,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.10.Flow33: + ; GFX90A-NEXT: bb.10.Flow31: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -170,7 +168,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.12.Flow34: + ; GFX90A-NEXT: bb.12.Flow32: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -187,7 +185,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.14.Flow35: + ; GFX90A-NEXT: bb.14.Flow33: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -210,7 +208,7 @@ ; GFX90A-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr18_sgpr19, @f2, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.16.Flow36: + ; GFX90A-NEXT: bb.16.Flow34: ; GFX90A-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) ; GFX90A-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -227,7 +225,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr46, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.18.Flow37: + ; GFX90A-NEXT: bb.18.Flow35: ; GFX90A-NEXT: successors: %bb.19(0x40000000), %bb.20(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -244,7 +242,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr62, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.20.Flow38: + ; GFX90A-NEXT: bb.20.Flow36: ; GFX90A-NEXT: successors: %bb.21(0x40000000), %bb.22(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -261,7 +259,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr60, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.22.Flow39: + ; GFX90A-NEXT: bb.22.Flow37: ; GFX90A-NEXT: successors: %bb.23(0x40000000), %bb.24(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -278,7 +276,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr58, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.24.Flow40: + ; GFX90A-NEXT: bb.24.Flow38: ; GFX90A-NEXT: successors: %bb.25(0x40000000), %bb.26(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -295,7 +293,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr56, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.26.Flow41: + ; GFX90A-NEXT: bb.26.Flow39: ; GFX90A-NEXT: successors: %bb.27(0x40000000), %bb.28(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -312,7 +310,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr44, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.28.Flow42: + ; GFX90A-NEXT: bb.28.Flow40: ; GFX90A-NEXT: successors: %bb.34(0x40000000), %bb.29(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -321,7 +319,7 @@ ; GFX90A-NEXT: renamable $sgpr4_sgpr5 = S_XOR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.34, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.29.Flow43: + ; GFX90A-NEXT: bb.29.Flow41: ; GFX90A-NEXT: successors: %bb.30(0x40000000), %bb.31(0x40000000) ; GFX90A-NEXT: liveins: $sgpr4_sgpr5, $sgpr34_sgpr35, $sgpr58_sgpr59, $vgpr40_vgpr41:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -337,7 +335,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.31.Flow44: + ; GFX90A-NEXT: bb.31.Flow42: ; GFX90A-NEXT: successors: %bb.32(0x40000000), %bb.33(0x40000000) ; GFX90A-NEXT: liveins: $sgpr58_sgpr59, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -366,7 +364,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -404,7 +402,7 @@ ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.36.Flow21: + ; GFX90A-NEXT: bb.36.Flow19: ; GFX90A-NEXT: successors: %bb.6(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -413,12 +411,12 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -441,9 +439,9 @@ ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.38.Flow22: + ; GFX90A-NEXT: bb.38.Flow20: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -458,18 +456,18 @@ ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr58_sgpr59, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -491,9 +489,9 @@ ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.40.Flow23: + ; GFX90A-NEXT: bb.40.Flow21: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -506,14 +504,14 @@ ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr60_sgpr61, killed renamable $sgpr58_sgpr59, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -521,10 +519,10 @@ ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec :: (load (s8) from %ir.i42, addrspace 1) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr20, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -543,33 +541,33 @@ ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.42.Flow24: + ; GFX90A-NEXT: bb.42.Flow22: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr58_sgpr59, killed renamable $sgpr60_sgpr61, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr60_sgpr61, killed renamable $sgpr58_sgpr59, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_XOR_B64 renamable $sgpr62_sgpr63, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc @@ -597,38 +595,38 @@ ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.45.Flow26: + ; GFX90A-NEXT: bb.45.Flow24: ; GFX90A-NEXT: successors: %bb.47(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec :: (load (s8) from %ir.i49, addrspace 1) - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -646,27 +644,27 @@ ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.47.Flow25: + ; GFX90A-NEXT: bb.47.Flow23: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr58_sgpr59, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc @@ -680,7 +678,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc @@ -708,7 +706,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc @@ -718,11 +716,11 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF @@ -737,7 +735,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec @@ -757,34 +755,35 @@ ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.55.Flow29: + ; GFX90A-NEXT: bb.55.Flow27: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr58_sgpr59, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr62_sgpr63, implicit $exec + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg2.kernarg.offset, align 16, addrspace 4) ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr50, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr51, killed $vgpr5, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: @@ -826,8 +825,9 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg10.kernarg.offset, align 8, addrspace 4) ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec @@ -835,7 +835,7 @@ ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.arg10.load, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 @@ -845,7 +845,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec @@ -853,7 +853,7 @@ ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF @@ -865,25 +865,25 @@ ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.60.Flow31: + ; GFX90A-NEXT: bb.60.Flow29: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.61.Flow30: + ; GFX90A-NEXT: bb.61.Flow28: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr58_sgpr59, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr50_sgpr51, killed renamable $sgpr60_sgpr61, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: @@ -894,7 +894,7 @@ ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.63.Flow13: + ; GFX90A-NEXT: bb.63.Flow11: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -910,14 +910,14 @@ ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.65.Flow10: + ; GFX90A-NEXT: bb.65.Flow8: ; GFX90A-NEXT: successors: %bb.66(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.66.Flow14: + ; GFX90A-NEXT: bb.66.Flow12: ; GFX90A-NEXT: successors: %bb.8(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} @@ -997,7 +997,7 @@ ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: bb.71.Flow9: + ; GFX90A-NEXT: bb.71.Flow7: ; GFX90A-NEXT: successors: %bb.63(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll @@ -6,14 +6,14 @@ define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) { ; CHECK-LABEL: eggs: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp0_b32 s0, 0 +; CHECK-NEXT: s_bitcmp0_b32 s6, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb10 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[8:9] +; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v0 ; CHECK-NEXT: v_lshrrev_b32_e32 v6, 16, v0 @@ -32,21 +32,24 @@ ; CHECK-NEXT: v_mov_b32_e32 v7, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: .LBB0_3: ; %bb41 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x48 -; CHECK-NEXT: v_mov_b32_e32 v8, s10 -; CHECK-NEXT: v_mov_b32_e32 v9, s11 -; CHECK-NEXT: v_mov_b32_e32 v10, s12 -; CHECK-NEXT: v_mov_b32_e32 v11, s13 -; CHECK-NEXT: v_mov_b32_e32 v12, s14 -; CHECK-NEXT: v_mov_b32_e32 v13, s15 -; CHECK-NEXT: v_mov_b32_e32 v14, s16 -; CHECK-NEXT: v_mov_b32_e32 v15, s17 -; CHECK-NEXT: v_mov_b32_e32 v16, s18 -; CHECK-NEXT: v_mov_b32_e32 v17, s19 -; CHECK-NEXT: v_mov_b32_e32 v18, s20 -; CHECK-NEXT: v_mov_b32_e32 v19, s21 -; CHECK-NEXT: v_mov_b32_e32 v20, s22 -; CHECK-NEXT: v_mov_b32_e32 v21, s23 +; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x18 +; CHECK-NEXT: v_mov_b32_e32 v8, s2 +; CHECK-NEXT: v_mov_b32_e32 v9, s3 +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x38 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v10, s8 +; CHECK-NEXT: v_mov_b32_e32 v11, s9 +; CHECK-NEXT: v_mov_b32_e32 v12, s10 +; CHECK-NEXT: v_mov_b32_e32 v13, s11 +; CHECK-NEXT: v_mov_b32_e32 v14, s12 +; CHECK-NEXT: v_mov_b32_e32 v15, s13 +; CHECK-NEXT: v_mov_b32_e32 v16, s14 +; CHECK-NEXT: v_mov_b32_e32 v17, s15 +; CHECK-NEXT: v_mov_b32_e32 v18, s0 +; CHECK-NEXT: v_mov_b32_e32 v19, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s2 +; CHECK-NEXT: v_mov_b32_e32 v21, s3 ; CHECK-NEXT: flat_store_byte v[8:9], v0 ; CHECK-NEXT: flat_store_byte v[10:11], v7 ; CHECK-NEXT: flat_store_byte v[12:13], v6 @@ -54,8 +57,7 @@ ; CHECK-NEXT: flat_store_byte v[16:17], v1 ; CHECK-NEXT: flat_store_byte v[18:19], v4 ; CHECK-NEXT: flat_store_byte v[20:21], v3 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1] ; CHECK-NEXT: flat_store_byte v[0:1], v2 ; CHECK-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -484,53 +484,59 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x double> %mag, <3 x double> %sign) { ; SI-LABEL: s_test_copysign_v3f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x15 +; SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x19 +; SI-NEXT: s_load_dword s0, s[0:1], 0x1e +; SI-NEXT: s_brev_b32 s1, -2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s7 +; SI-NEXT: v_mov_b32_e32 v0, s11 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s10, v0, v1 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s1, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s10, v0, v2 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: v_bfi_b32 v1, s1, v0, v1 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_bfi_b32 v5, s1, v0, v2 +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v3f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s2, -2 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x64 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x54 +; VI-NEXT: s_load_dword s0, s[0:1], 0x78 +; VI-NEXT: s_brev_b32 s1, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 -; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_bfi_b32 v3, s1, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_bfi_b32 v1, s1, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s9 -; VI-NEXT: v_mov_b32_e32 v2, s17 -; VI-NEXT: v_bfi_b32 v5, s2, v0, v2 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s2, 16 +; VI-NEXT: v_bfi_b32 v5, s1, v0, v2 +; VI-NEXT: s_addc_u32 s1, s3, 0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %result = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign) @@ -541,45 +547,46 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %mag, <4 x double> %sign) { ; SI-LABEL: s_test_copysign_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x19 +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_mov_b32 s23, 0xf000 +; SI-NEXT: s_mov_b32 s22, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_brev_b32 s12, -2 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: v_mov_b32_e32 v1, s15 -; SI-NEXT: v_bfi_b32 v3, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v3, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_bfi_b32 v1, s12, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, s11 ; SI-NEXT: v_mov_b32_e32 v2, s19 -; SI-NEXT: v_bfi_b32 v7, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v7, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v0, s9 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_bfi_b32 v5, s12, v0, v2 +; SI-NEXT: v_bfi_b32 v5, s0, v0, v2 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_test_copysign_v4f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0x64 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_brev_b32 s2, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: v_mov_b32_e32 v2, s5 ; VI-NEXT: v_bfi_b32 v3, s2, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s13 -; VI-NEXT: v_bfi_b32 v1, s2, v2, v0 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v2, s19 ; VI-NEXT: v_bfi_b32 v7, s2, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -43,41 +43,41 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_add_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_add_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -226,9 +226,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -244,9 +244,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -391,41 +391,41 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_and_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_and_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -574,9 +574,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -592,9 +592,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -739,41 +739,41 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_sub_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_sub_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -922,9 +922,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -940,9 +940,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1085,40 +1085,40 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_max_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_max_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1264,9 +1264,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -1282,9 +1282,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -1425,40 +1425,40 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umax_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umax_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1604,9 +1604,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -1622,9 +1622,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -1765,40 +1765,40 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_min_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_min_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -1944,9 +1944,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -1962,9 +1962,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -2105,40 +2105,40 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_umin_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_umin_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm @@ -2284,9 +2284,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt lgkmcnt(0) @@ -2302,9 +2302,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt lgkmcnt(0) @@ -2447,41 +2447,41 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_or_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_or_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -2630,9 +2630,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2648,9 +2648,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2871,41 +2871,41 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xchg_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xchg_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -3054,9 +3054,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3072,9 +3072,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3219,41 +3219,41 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_xor_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_xor_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -3402,9 +3402,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3420,9 +3420,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3939,10 +3939,10 @@ ; GCN1-NEXT: s_addc_u32 s1, s1, 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3960,10 +3960,10 @@ ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4132,9 +4132,9 @@ ; GCN1: ; %bb.0: ; %entry ; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v4, s0 ; GCN1-NEXT: v_mov_b32_e32 v5, s1 -; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v3, s7 @@ -4151,9 +4151,9 @@ ; GCN2: ; %bb.0: ; %entry ; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v4, s0 ; GCN2-NEXT: v_mov_b32_e32 v5, s1 -; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v3, s7 @@ -4627,41 +4627,41 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_inc_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_inc_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -4810,9 +4810,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4828,9 +4828,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4975,41 +4975,41 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { ; GCN1-LABEL: atomic_dec_i64_ret_offset: ; GCN1: ; %bb.0: ; %entry -; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: s_add_u32 s0, s0, 32 -; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_add_u32 s2, s4, 32 +; GCN1-NEXT: s_addc_u32 s3, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 ; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_ret_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: s_add_u32 s0, s0, 32 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s2, s4, 32 +; GCN2-NEXT: s_addc_u32 s3, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 ; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm entry: @@ -5158,9 +5158,9 @@ ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5176,9 +5176,9 @@ ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -105,11 +105,11 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 -; VI-NEXT: v_rcp_f32_e32 v5, v5 -; VI-NEXT: v_mul_f32_e32 v3, v3, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v3 +; VI-NEXT: v_mul_f32_e32 v3, v5, v3 ; VI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v3, v3 @@ -126,11 +126,11 @@ ; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 @@ -149,11 +149,11 @@ ; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 @@ -172,13 +172,13 @@ ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1068,9 +1068,9 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -1276,19 +1276,20 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] @@ -1469,19 +1470,20 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; VI-NEXT: v_fma_f64 v[6:7], v[8:9], v[6:7], v[6:7] +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mul_f64 v[8:9], v[2:3], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[4:5], v[8:9], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[8:9] @@ -1716,22 +1718,22 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v5, v5, v7 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v3 +; VI-NEXT: v_rcp_f32_e32 v5, v5 +; VI-NEXT: v_mul_f32_e32 v5, v7, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3 +; VI-NEXT: v_div_fixup_f16 v5, v5, v3, v6 ; VI-NEXT: v_trunc_f16_e32 v5, v5 -; VI-NEXT: v_fma_f16 v3, -v5, v6, v3 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 -; VI-NEXT: v_cvt_f32_f16_e32 v5, v4 +; VI-NEXT: v_fma_f16 v3, -v5, v3, v6 +; VI-NEXT: v_cvt_f32_f16_e32 v5, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v4 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; VI-NEXT: v_rcp_f32_e32 v6, v6 -; VI-NEXT: v_mul_f32_e32 v5, v5, v6 +; VI-NEXT: v_rcp_f32_e32 v5, v5 +; VI-NEXT: v_mul_f32_e32 v5, v6, v5 ; VI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4 ; VI-NEXT: v_trunc_f16_e32 v5, v5 @@ -1749,21 +1751,21 @@ ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v4, v4 @@ -1783,21 +1785,21 @@ ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v4, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v4, v4 @@ -1817,13 +1819,13 @@ ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1832,12 +1834,12 @@ ; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1 -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v5 +; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1 @@ -2081,42 +2083,42 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 -; VI-NEXT: v_rcp_f32_e32 v9, v9 -; VI-NEXT: v_mul_f32_e32 v7, v7, v9 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_cvt_f32_f16_e32 v9, v8 +; VI-NEXT: v_rcp_f32_e32 v7, v7 +; VI-NEXT: v_mul_f32_e32 v7, v9, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6 +; VI-NEXT: v_div_fixup_f16 v7, v7, v6, v8 ; VI-NEXT: v_trunc_f16_e32 v7, v7 -; VI-NEXT: v_fma_f16 v6, -v7, v8, v6 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v3 +; VI-NEXT: v_fma_f16 v6, -v7, v6, v8 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v5 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v7, v7, v8 +; VI-NEXT: v_rcp_f32_e32 v7, v7 +; VI-NEXT: v_mul_f32_e32 v7, v8, v7 ; VI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3 ; VI-NEXT: v_trunc_f16_e32 v7, v7 ; VI-NEXT: v_fma_f16 v3, -v7, v5, v3 -; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 -; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; VI-NEXT: v_or_b32_e32 v3, v3, v6 ; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 -; VI-NEXT: v_rcp_f32_e32 v8, v8 -; VI-NEXT: v_mul_f32_e32 v6, v6, v8 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 +; VI-NEXT: v_rcp_f32_e32 v6, v6 +; VI-NEXT: v_mul_f32_e32 v6, v8, v6 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5 +; VI-NEXT: v_div_fixup_f16 v6, v6, v5, v7 ; VI-NEXT: v_trunc_f16_e32 v6, v6 -; VI-NEXT: v_fma_f16 v5, -v6, v7, v5 -; VI-NEXT: v_cvt_f32_f16_e32 v7, v4 -; VI-NEXT: v_cvt_f32_f16_e32 v6, v2 +; VI-NEXT: v_fma_f16 v5, -v6, v5, v7 +; VI-NEXT: v_cvt_f32_f16_e32 v6, v4 +; VI-NEXT: v_cvt_f32_f16_e32 v7, v2 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; VI-NEXT: v_rcp_f32_e32 v7, v7 -; VI-NEXT: v_mul_f32_e32 v6, v6, v7 +; VI-NEXT: v_rcp_f32_e32 v6, v6 +; VI-NEXT: v_mul_f32_e32 v6, v7, v6 ; VI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2 ; VI-NEXT: v_trunc_f16_e32 v6, v6 @@ -2134,40 +2136,40 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5 ; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX9-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GFX9-NEXT: v_rcp_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX9-NEXT: v_trunc_f16_e32 v6, v6 ; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-NEXT: v_rcp_f32_e32 v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 ; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2 +; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX9-NEXT: v_rcp_f32_e32 v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX9-NEXT: v_trunc_f16_e32 v5, v5 @@ -2187,40 +2189,40 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3 +; GFX10-NEXT: v_rcp_f32_e32 v5, v5 +; GFX10-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 ; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX10-NEXT: v_rcp_f32_e32 v7, v7 -; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GFX10-NEXT: v_rcp_f32_e32 v6, v6 +; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX10-NEXT: v_trunc_f16_e32 v6, v6 ; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v5 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 ; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX10-NEXT: v_rcp_f32_e32 v6, v6 -; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX10-NEXT: v_rcp_f32_e32 v5, v5 +; GFX10-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0 ; GFX10-NEXT: v_trunc_f16_e32 v5, v5 @@ -2240,13 +2242,13 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1 @@ -2255,25 +2257,25 @@ ; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v1 -; GFX11-NEXT: v_rcp_f32_e32 v7, v7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v1 +; GFX11-NEXT: v_rcp_f32_e32 v6, v6 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v6, v7 +; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v6, v6 ; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f16_e32 v6, v6 ; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 -; GFX11-NEXT: v_rcp_f32_e32 v5, v5 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0 @@ -2283,12 +2285,12 @@ ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v6 +; GFX11-NEXT: v_rcp_f32_e32 v5, v5 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6 +; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5 ; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -59,11 +59,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -79,11 +79,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -304,8 +304,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc @@ -324,8 +324,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc @@ -539,11 +539,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -559,11 +559,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -784,8 +784,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc @@ -804,8 +804,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc @@ -1019,11 +1019,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1039,11 +1039,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1264,8 +1264,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc @@ -1284,8 +1284,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc @@ -1493,11 +1493,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1512,11 +1512,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1720,8 +1720,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc @@ -1739,8 +1739,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc @@ -1937,11 +1937,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1956,11 +1956,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2164,8 +2164,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc @@ -2183,8 +2183,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc @@ -2381,11 +2381,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2400,11 +2400,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -2608,8 +2608,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc @@ -2627,8 +2627,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc @@ -2825,11 +2825,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -2844,11 +2844,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3052,8 +3052,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc @@ -3071,8 +3071,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc @@ -3275,11 +3275,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3295,11 +3295,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -3520,8 +3520,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc @@ -3540,8 +3540,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc @@ -3849,11 +3849,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -3869,11 +3869,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4094,8 +4094,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc @@ -4114,8 +4114,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc @@ -4329,11 +4329,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4349,11 +4349,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4574,8 +4574,8 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: s_mov_b32 s1, s5 ; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc @@ -4594,8 +4594,8 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s4 -; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc @@ -4882,11 +4882,11 @@ ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: s_mov_b32 s8, s2 ; CI-NEXT: s_mov_b32 s9, s3 ; CI-NEXT: s_mov_b32 s2, s10 ; CI-NEXT: s_mov_b32 s3, s11 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 @@ -4903,11 +4903,11 @@ ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s8, s2 ; VI-NEXT: s_mov_b32 s9, s3 ; VI-NEXT: s_mov_b32 s2, s10 ; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -5154,9 +5154,9 @@ ; CI-NEXT: s_mov_b32 s11, 0xf000 ; CI-NEXT: s_mov_b32 s10, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: s_mov_b32 s8, s0 ; CI-NEXT: s_mov_b32 s9, s1 -; CI-NEXT: v_mov_b32_e32 v0, s4 ; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_mov_b32_e32 v2, s6 ; CI-NEXT: v_mov_b32_e32 v3, s7 @@ -5175,9 +5175,9 @@ ; VI-NEXT: s_mov_b32 s11, 0xf000 ; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_mov_b32 s8, s0 ; VI-NEXT: s_mov_b32 s9, s1 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -6026,11 +6026,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6046,11 +6046,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6197,11 +6197,11 @@ ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b32 s0, s6 -; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s1, s7 ; CI-NEXT: s_mov_b32 s6, s2 ; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: v_mov_b32_e32 v1, s9 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -6217,11 +6217,11 @@ ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -17,7 +17,7 @@ ; CHECK: .private_segment_fixed_size: 0 ; CHECK: .sgpr_count: 6 ; CHECK: .symbol: test.kd -; CHECK: .vgpr_count: {{3|6}} +; CHECK: .vgpr_count: {{3|5}} ; WAVE64: .wavefront_size: 64 ; WAVE32: .wavefront_size: 32 define amdgpu_kernel void @test( diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -17,7 +17,7 @@ ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 6 -; CHECK: NumVGPRs: {{3|6}} +; CHECK: NumVGPRs: {{3|5}} ; CHECK: MaxFlatWorkGroupSize: 1024 define amdgpu_kernel void @test( ptr addrspace(1) %r, @@ -40,7 +40,7 @@ ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 6 -; CHECK: NumVGPRs: {{3|6}} +; CHECK: NumVGPRs: {{3|5}} ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test_max_flat_workgroup_size( ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -110,13 +110,13 @@ ; GCN-LABEL: float8_inselt: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44 -; GCN-NEXT: s_load_dword s2, s[0:1], 0x64 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s1, s[0:1], 0x64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_add_u32 s2, s0, 16 -; GCN-NEXT: s_addc_u32 s3, s1, 0 +; GCN-NEXT: s_add_u32 s0, s2, 16 +; GCN-NEXT: s_mov_b32 m0, s1 +; GCN-NEXT: s_addc_u32 s1, s3, 0 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s7 @@ -124,13 +124,13 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s9 ; GCN-NEXT: v_mov_b32_e32 v6, s10 ; GCN-NEXT: v_mov_b32_e32 v7, s11 -; GCN-NEXT: v_mov_b32_e32 v9, s3 +; GCN-NEXT: v_mov_b32_e32 v9, s1 ; GCN-NEXT: v_movreld_b32_e32 v0, 1.0 -; GCN-NEXT: v_mov_b32_e32 v8, s2 +; GCN-NEXT: v_mov_b32_e32 v8, s0 ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -654,8 +654,8 @@ define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 @@ -677,8 +677,8 @@ ; ; VI-LABEL: dynamic_insertelement_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 @@ -1154,8 +1154,8 @@ ; SI-LABEL: dynamic_insertelement_v8i32: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; SI-NEXT: s_load_dword s6, s[4:5], 0x10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; SI-NEXT: s_load_dword s4, s[4:5], 0x10 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1167,7 +1167,7 @@ ; SI-NEXT: v_mov_b32_e32 v5, s13 ; SI-NEXT: v_mov_b32_e32 v6, s14 ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: s_mov_b32 m0, s4 +; SI-NEXT: s_mov_b32 m0, s6 ; SI-NEXT: v_movreld_b32_e32 v0, 5 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -1176,8 +1176,8 @@ ; VI-LABEL: dynamic_insertelement_v8i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 +; VI-NEXT: s_load_dword s6, s[4:5], 0x40 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dword s4, s[4:5], 0x40 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) @@ -1189,7 +1189,7 @@ ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v6, s14 ; VI-NEXT: v_mov_b32_e32 v7, s15 -; VI-NEXT: s_mov_b32 m0, s4 +; VI-NEXT: s_mov_b32 m0, s6 ; VI-NEXT: v_movreld_b32_e32 v0, 5 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -380,16 +380,16 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: s_lshr_b32 s2, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 @@ -411,69 +411,69 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v2f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 -; GFX10-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v1, s0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_max_f16 v0, v1, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -692,11 +692,11 @@ ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -719,40 +719,40 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_max_f16_e32 v1, v2, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_max_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX9-NEXT: v_pk_max_f16 v1, v2, v1 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -760,11 +760,11 @@ ; GFX10-LABEL: maxnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -772,8 +772,8 @@ ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX10-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX10-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -781,24 +781,24 @@ ; GFX11-LABEL: maxnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v0 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v1, v2 +; GFX11-NEXT: v_pk_max_f16 v0, v0, v3 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -827,27 +827,27 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: s_lshr_b32 s6, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_max_f32_e32 v3, v3, v4 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_max_f32_e32 v1, v1, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_max_f32_e32 v2, v2, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 @@ -872,56 +872,56 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_max_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_max_f16_e32 v1, v1, v2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_max_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_max_f16_e64 v3, s8, s8 +; VI-NEXT: v_max_f16_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: maxnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_max_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v1, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_max_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_max_f16 v0, v2, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: maxnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -929,30 +929,30 @@ ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX10-NEXT: v_pk_max_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_max_f16 v1, v0, v1 +; GFX10-NEXT: v_pk_max_f16 v0, v2, v3 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: maxnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_max_f16 v0, v3, v2 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_pk_max_f16 v1, v0, v1 +; GFX11-NEXT: v_pk_max_f16 v0, v2, v3 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -405,16 +405,16 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 +; SI-NEXT: s_lshr_b32 s2, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 @@ -436,68 +436,68 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v2f16_ieee: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0 -; GFX9-NEXT: s_load_dword s11, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s10, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s11, s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v2f16_ieee: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v1, s1, s1 -; GFX10-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v2f16_ieee: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0 +; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v1, s2, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v1, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v1 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -749,11 +749,11 @@ ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 ; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -776,40 +776,40 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 -; VI-NEXT: v_min_f16_e32 v1, v2, v1 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_max_f16_e64 v2, s8, s8 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v3f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s10, s10 ; GFX9-NEXT: v_pk_max_f16 v1, s12, s12 ; GFX9-NEXT: v_pk_max_f16 v2, s11, s11 -; GFX9-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v0, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX9-NEXT: v_pk_min_f16 v1, v2, v1 ; GFX9-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -817,11 +817,11 @@ ; GFX10-LABEL: minnum_v3f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -829,8 +829,8 @@ ; GFX10-NEXT: v_pk_max_f16 v2, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v0, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX10-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX10-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX10-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm @@ -838,23 +838,23 @@ ; GFX11-LABEL: minnum_v3f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v1, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v2, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v0, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v2, v1 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v0 +; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v2, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v0, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v1, v1, v2 +; GFX11-NEXT: v_pk_min_f16 v0, v0, v3 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4 +; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, @@ -883,27 +883,27 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_lshr_b32 s6, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 -; SI-NEXT: s_lshr_b32 s6, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s7 +; SI-NEXT: s_lshr_b32 s6, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 +; SI-NEXT: v_min_f32_e32 v3, v3, v4 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v7 +; SI-NEXT: v_min_f32_e32 v1, v1, v4 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_min_f32_e32 v2, v2, v4 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 @@ -928,56 +928,56 @@ ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 ; VI-NEXT: s_lshr_b32 s0, s9, 16 -; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 -; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 -; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: s_lshr_b32 s1, s3, 16 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 +; VI-NEXT: v_max_f16_e64 v1, s1, s1 +; VI-NEXT: v_min_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_max_f16_e64 v2, s9, s9 +; VI-NEXT: v_min_f16_e32 v1, v1, v2 ; VI-NEXT: s_lshr_b32 s0, s8, 16 -; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_or_b32_e32 v1, v1, v0 +; VI-NEXT: v_max_f16_e64 v0, s0, s0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_min_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_max_f16_e64 v3, s8, s8 +; VI-NEXT: v_min_f16_e32 v2, v2, v3 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: minnum_v4f16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[12:13], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_pk_max_f16 v0, s11, s11 ; GFX9-NEXT: v_pk_max_f16 v1, s13, s13 ; GFX9-NEXT: v_pk_max_f16 v2, s10, s10 -; GFX9-NEXT: v_pk_min_f16 v1, v1, v0 +; GFX9-NEXT: v_pk_min_f16 v1, v0, v1 ; GFX9-NEXT: v_pk_max_f16 v0, s12, s12 -; GFX9-NEXT: v_pk_min_f16 v0, v0, v2 +; GFX9-NEXT: v_pk_min_f16 v0, v2, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: minnum_v4f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -985,29 +985,29 @@ ; GFX10-NEXT: v_pk_max_f16 v1, s9, s9 ; GFX10-NEXT: v_pk_max_f16 v2, s0, s0 ; GFX10-NEXT: v_pk_max_f16 v3, s8, s8 -; GFX10-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX10-NEXT: v_pk_min_f16 v0, v3, v2 +; GFX10-NEXT: v_pk_min_f16 v1, v0, v1 +; GFX10-NEXT: v_pk_min_f16 v0, v2, v3 ; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: minnum_v4f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 -; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s5, s5 -; GFX11-NEXT: v_pk_max_f16 v1, s3, s3 -; GFX11-NEXT: v_pk_max_f16 v2, s4, s4 -; GFX11-NEXT: v_pk_max_f16 v3, s2, s2 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_min_f16 v1, v1, v0 -; GFX11-NEXT: v_pk_min_f16 v0, v3, v2 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: v_pk_max_f16 v0, s3, s3 +; GFX11-NEXT: v_pk_max_f16 v1, s1, s1 +; GFX11-NEXT: v_pk_max_f16 v2, s2, s2 +; GFX11-NEXT: v_pk_max_f16 v3, s0, s0 +; GFX11-NEXT: v_pk_min_f16 v1, v0, v1 +; GFX11-NEXT: v_pk_min_f16 v0, v2, v3 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ptr addrspace(1) %r, diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -16,7 +16,8 @@ ; Tests whether a load-chain of 8 constants of 64bit each gets vectorized into a wider load. ; FUNC-LABEL: {{^}}constant_load_2v4f64: -; GCN: s_load_dwordx16 +; GCN: s_load_dwordx8 +; GCN: s_load_dwordx8 define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocapture readonly %weights, ptr addrspace(1) noalias nocapture %out_ptr) { entry: %out_ptr.promoted = load double, ptr addrspace(1) %out_ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -1584,101 +1584,116 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) { ; SI-LABEL: mul64_in_branch: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 -; SI-NEXT: s_and_b64 vcc, exec, s[10:11] -; SI-NEXT: s_cbranch_vccz .LBB12_4 +; SI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[4:5], 0 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccz .LBB12_2 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mul_hi_u32 v0, s4, v0 -; SI-NEXT: s_mul_i32 s7, s4, s7 -; SI-NEXT: s_mul_i32 s5, s5, s6 -; SI-NEXT: s_mul_i32 s4, s4, s6 -; SI-NEXT: v_add_i32_e32 v0, vcc, s7, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_mul_i32 s2, s4, s7 +; SI-NEXT: s_mul_i32 s3, s5, s6 +; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; SI-NEXT: s_mul_i32 s2, s4, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: s_branch .LBB12_3 +; SI-NEXT: .LBB12_2: +; SI-NEXT: s_mov_b64 s[8:9], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SI-NEXT: .LBB12_3: ; %Flow +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz .LBB12_3 -; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 vcc, vcc +; SI-NEXT: s_cbranch_vccnz .LBB12_5 +; SI-NEXT: ; %bb.4: ; %if ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: .LBB12_3: ; %endif +; SI-NEXT: .LBB12_5: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB12_4: -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB12_2 ; ; VI-LABEL: mul64_in_branch: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-NEXT: s_cbranch_scc0 .LBB12_4 +; VI-NEXT: s_cbranch_scc0 .LBB12_2 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0 -; VI-NEXT: s_mul_i32 s4, s4, s7 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 -; VI-NEXT: s_mul_i32 s4, s5, s6 -; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, 0 +; VI-NEXT: s_mul_i32 s2, s4, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_mul_i32 s2, s5, s6 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: s_branch .LBB12_3 +; VI-NEXT: .LBB12_2: +; VI-NEXT: s_mov_b64 s[8:9], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 +; VI-NEXT: .LBB12_3: ; %Flow +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VI-NEXT: s_cbranch_vccnz .LBB12_3 -; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_cbranch_vccnz .LBB12_5 +; VI-NEXT: ; %bb.4: ; %if ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s2 ; VI-NEXT: s_mov_b32 s5, s3 ; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; VI-NEXT: .LBB12_3: ; %endif +; VI-NEXT: .LBB12_5: ; %endif +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB12_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB12_2 ; ; GFX9-LABEL: mul64_in_branch: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[8:9], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX9-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX9-NEXT: ; %bb.1: ; %else -; GFX9-NEXT: s_mul_i32 s7, s4, s7 -; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6 -; GFX9-NEXT: s_add_i32 s7, s10, s7 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_add_i32 s5, s7, s5 +; GFX9-NEXT: s_mul_i32 s2, s4, s7 +; GFX9-NEXT: s_mul_hi_u32 s3, s4, s6 +; GFX9-NEXT: s_add_i32 s2, s3, s2 +; GFX9-NEXT: s_mul_i32 s3, s5, s6 +; GFX9-NEXT: s_add_i32 s5, s2, s3 ; GFX9-NEXT: s_mul_i32 s4, s4, s6 +; GFX9-NEXT: s_branch .LBB12_3 +; GFX9-NEXT: .LBB12_2: +; GFX9-NEXT: s_mov_b64 s[8:9], -1 +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: .LBB12_3: ; %Flow +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GFX9-NEXT: s_cbranch_vccnz .LBB12_4 -; GFX9-NEXT: .LBB12_2: ; %if +; GFX9-NEXT: s_cbranch_vccnz .LBB12_5 +; GFX9-NEXT: ; %bb.4: ; %if ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s4, s2 ; GFX9-NEXT: s_mov_b32 s5, s3 ; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX9-NEXT: s_branch .LBB12_5 -; GFX9-NEXT: .LBB12_3: -; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-NEXT: s_branch .LBB12_2 -; GFX9-NEXT: .LBB12_4: +; GFX9-NEXT: s_branch .LBB12_6 +; GFX9-NEXT: .LBB12_5: ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: .LBB12_5: ; %endif +; GFX9-NEXT: .LBB12_6: ; %endif +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1687,34 +1702,39 @@ ; ; GFX10-LABEL: mul64_in_branch: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX10-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX10-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX10-NEXT: ; %bb.1: ; %else -; GFX10-NEXT: s_mul_i32 s7, s4, s7 -; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX10-NEXT: s_mul_i32 s2, s4, s7 +; GFX10-NEXT: s_mul_hi_u32 s3, s4, s6 ; GFX10-NEXT: s_mul_i32 s5, s5, s6 -; GFX10-NEXT: s_add_i32 s7, s8, s7 +; GFX10-NEXT: s_add_i32 s2, s3, s2 ; GFX10-NEXT: s_mul_i32 s4, s4, s6 -; GFX10-NEXT: s_add_i32 s5, s7, s5 +; GFX10-NEXT: s_add_i32 s5, s2, s5 ; GFX10-NEXT: s_mov_b32 s6, 0 -; GFX10-NEXT: s_cbranch_execnz .LBB12_4 -; GFX10-NEXT: .LBB12_2: ; %if +; GFX10-NEXT: s_branch .LBB12_3 +; GFX10-NEXT: .LBB12_2: +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX10-NEXT: .LBB12_3: ; %Flow +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 +; GFX10-NEXT: s_cbranch_vccnz .LBB12_5 +; GFX10-NEXT: ; %bb.4: ; %if ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s4, s2 ; GFX10-NEXT: s_mov_b32 s5, s3 ; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; GFX10-NEXT: s_branch .LBB12_5 -; GFX10-NEXT: .LBB12_3: -; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX10-NEXT: s_branch .LBB12_2 -; GFX10-NEXT: .LBB12_4: +; GFX10-NEXT: s_branch .LBB12_6 +; GFX10-NEXT: .LBB12_5: ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: .LBB12_5: ; %endif +; GFX10-NEXT: .LBB12_6: ; %endif +; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1723,33 +1743,38 @@ ; ; GFX11-LABEL: mul64_in_branch: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX11-NEXT: s_cbranch_scc0 .LBB12_3 +; GFX11-NEXT: s_cbranch_scc0 .LBB12_2 ; GFX11-NEXT: ; %bb.1: ; %else -; GFX11-NEXT: s_mul_i32 s7, s4, s7 -; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6 +; GFX11-NEXT: s_mul_i32 s2, s4, s7 +; GFX11-NEXT: s_mul_hi_u32 s3, s4, s6 ; GFX11-NEXT: s_mul_i32 s5, s5, s6 -; GFX11-NEXT: s_add_i32 s7, s8, s7 +; GFX11-NEXT: s_add_i32 s2, s3, s2 ; GFX11-NEXT: s_mul_i32 s4, s4, s6 -; GFX11-NEXT: s_add_i32 s5, s7, s5 +; GFX11-NEXT: s_add_i32 s5, s2, s5 ; GFX11-NEXT: s_mov_b32 s6, 0 -; GFX11-NEXT: s_cbranch_execnz .LBB12_4 -; GFX11-NEXT: .LBB12_2: ; %if +; GFX11-NEXT: s_branch .LBB12_3 +; GFX11-NEXT: .LBB12_2: +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX11-NEXT: .LBB12_3: ; %Flow +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX11-NEXT: s_cbranch_vccnz .LBB12_5 +; GFX11-NEXT: ; %bb.4: ; %if ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s4, s2 ; GFX11-NEXT: s_mov_b32 s5, s3 ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_branch .LBB12_5 -; GFX11-NEXT: .LBB12_3: -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX11-NEXT: s_branch .LBB12_2 -; GFX11-NEXT: .LBB12_4: +; GFX11-NEXT: s_branch .LBB12_6 +; GFX11-NEXT: .LBB12_5: ; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-NEXT: .LBB12_5: ; %endif +; GFX11-NEXT: .LBB12_6: ; %endif +; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -85,7 +85,7 @@ ; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]] ; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]] -; GCN: ; %Flow4 +; GCN: ; %Flow3 ; GCN-NEXT: s_or_b64 exec, exec, ; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]] ; GCN-NEXT: s_xor_b64 @@ -94,7 +94,7 @@ ; GCN-DAG: ds_write_b32 ; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec -; GCN: ; %Flow5 +; GCN: ; %Flow4 ; GCN-NEXT: s_or_b64 exec, exec, ; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -215,7 +215,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[6:7] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -467,12 +467,12 @@ ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -497,12 +497,12 @@ ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -637,12 +637,12 @@ ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 @@ -667,12 +667,12 @@ ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 @@ -704,12 +704,12 @@ ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -734,12 +734,12 @@ ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -771,12 +771,12 @@ ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 39 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 @@ -801,12 +801,12 @@ ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 39 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -309,8 +309,8 @@ ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v4, v2 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_or_b32_e32 v2, v3, v2 @@ -399,11 +399,11 @@ ; NOSDWA-NEXT: v_mov_b32_e32 v5, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v6, v1, v3 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v7, v0, v2 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v3 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v0, v0, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -500,17 +500,17 @@ ; NOSDWA-NEXT: v_mov_b32_e32 v9, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) ; NOSDWA-NEXT: v_mul_lo_u16_e32 v10, v3, v7 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 16, v7 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v11, v2, v6 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v12, v1, v5 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v13, v0, v4 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v7 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v2, v6 ; NOSDWA-NEXT: v_mul_lo_u16_e32 v1, v1, v5 @@ -611,9 +611,9 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_ushort v4, v[0:1] ; NOSDWA-NEXT: flat_load_ushort v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 @@ -629,9 +629,9 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_ushort v4, v[0:1] ; GFX89-NEXT: flat_load_ushort v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 @@ -683,8 +683,8 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] @@ -707,9 +707,9 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dword v4, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 @@ -763,9 +763,9 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; NOSDWA-NEXT: v_mov_b32_e32 v4, s4 @@ -793,9 +793,9 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v4, s4 @@ -854,8 +854,8 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v4, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v0, s0 +; NOSDWA-NEXT: v_mov_b32_e32 v5, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s1 ; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -896,9 +896,9 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v4, s0 -; GFX89-NEXT: v_mov_b32_e32 v5, s1 ; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX89-NEXT: v_mov_b32_e32 v8, s4 @@ -1523,8 +1523,8 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 -; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 +; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: flat_load_dword v3, v[0:1] @@ -1547,8 +1547,8 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 -; GFX89-NEXT: v_mov_b32_e32 v2, s0 ; GFX89-NEXT: v_mov_b32_e32 v3, s1 +; GFX89-NEXT: v_mov_b32_e32 v2, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: flat_load_dword v3, v[0:1] @@ -1689,18 +1689,18 @@ ; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; NOSDWA-NEXT: flat_load_dword v4, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] -; NOSDWA-NEXT: flat_load_dword v3, v[0:1] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s5 ; NOSDWA-NEXT: s_waitcnt vmcnt(1) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v2 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v5, v5, v4 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v3, v2 -; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v5, v4 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v4, v4, v2 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, v3, v5 +; NOSDWA-NEXT: v_mul_lo_u16_e32 v2, v4, v2 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; NOSDWA-NEXT: v_or_b32_e32 v2, v2, v3 ; NOSDWA-NEXT: flat_store_dword v[0:1], v2 @@ -1783,9 +1783,9 @@ ; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; NOSDWA-NEXT: s_waitcnt lgkmcnt(0) ; NOSDWA-NEXT: v_mov_b32_e32 v0, s6 +; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: v_mov_b32_e32 v1, s7 ; NOSDWA-NEXT: v_mov_b32_e32 v2, s0 -; NOSDWA-NEXT: v_mov_b32_e32 v3, s1 ; NOSDWA-NEXT: flat_load_dword v1, v[0:1] ; NOSDWA-NEXT: flat_load_dword v2, v[2:3] ; NOSDWA-NEXT: v_mov_b32_e32 v0, s4 @@ -1808,9 +1808,9 @@ ; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s6 +; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: v_mov_b32_e32 v2, s0 -; GFX89-NEXT: v_mov_b32_e32 v3, s1 ; GFX89-NEXT: flat_load_dword v1, v[0:1] ; GFX89-NEXT: flat_load_dword v2, v[2:3] ; GFX89-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -12,11 +12,12 @@ ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_load_dword s2, s[0:1], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s7, s7, s2 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if @@ -158,45 +159,47 @@ define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 -; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc -; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] +; SI-NEXT: ; implicit-def: $sgpr4_sgpr5 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB3_2: ; %Flow -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[10:11] +; SI-NEXT: s_or_saveexec_b64 s[6:7], s[2:3] +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_xor_b64 exec, exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], exec +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], exec ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] ; SI-NEXT: .LBB3_4: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[0:1] -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -183,29 +183,30 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_shl_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, s4, 64 -; GCN-NEXT: s_sub_i32 s12, 64, s4 -; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s4 -; GCN-NEXT: s_lshl_b64 s[8:9], s[0:1], s4 -; GCN-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 -; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s12 -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s0, s0, s10 -; GCN-NEXT: s_cselect_b32 s1, s1, s11 -; GCN-NEXT: s_cselect_b32 s5, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s1, s3, s1 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: s_sub_i32 s8, 64, s10 +; GCN-NEXT: s_sub_i32 s6, s10, 64 +; GCN-NEXT: s_lshl_b64 s[4:5], s[2:3], s10 +; GCN-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s4, s4, s6 +; GCN-NEXT: s_cselect_b32 s5, s5, s7 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = shl i128 %lhs, %rhs @@ -216,29 +217,30 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_lshr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, s4, 64 -; GCN-NEXT: s_sub_i32 s12, 64, s4 -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_lshr_b64 s[8:9], s[2:3], s4 -; GCN-NEXT: s_lshr_b64 s[10:11], s[2:3], s5 -; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s12 -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s2, s2, s10 -; GCN-NEXT: s_cselect_b32 s3, s3, s11 -; GCN-NEXT: s_cselect_b32 s5, s9, 0 -; GCN-NEXT: s_cselect_b32 s6, s8, 0 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, s3 -; GCN-NEXT: s_cselect_b32 s0, s0, s2 +; GCN-NEXT: s_sub_i32 s8, 64, s10 +; GCN-NEXT: s_sub_i32 s6, s10, 64 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s10 +; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s4, s4, s6 +; GCN-NEXT: s_cselect_b32 s5, s5, s7 +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: s_cselect_b32 s2, s2, 0 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s5 +; GCN-NEXT: s_cselect_b32 s0, s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = lshr i128 %lhs, %rhs @@ -249,26 +251,27 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) { ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GCN-NEXT: s_load_dword s10, s[4:5], 0x4 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s5, 64, s4 -; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GCN-NEXT: s_sub_i32 s10, s4, 64 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], s5 -; GCN-NEXT: s_ashr_i32 s12, s3, 31 -; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], s10 -; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_cmp_lt_u32 s4, 64 -; GCN-NEXT: s_cselect_b32 s3, s3, s12 -; GCN-NEXT: s_cselect_b32 s2, s2, s12 -; GCN-NEXT: s_cselect_b32 s5, s6, s10 -; GCN-NEXT: s_cselect_b32 s6, s7, s11 -; GCN-NEXT: s_cmp_eq_u32 s4, 0 -; GCN-NEXT: s_cselect_b32 s1, s1, s6 -; GCN-NEXT: s_cselect_b32 s0, s0, s5 +; GCN-NEXT: s_sub_i32 s6, 64, s10 +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], s10 +; GCN-NEXT: s_sub_i32 s8, s10, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 +; GCN-NEXT: s_ashr_i64 s[8:9], s[2:3], s8 +; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_ashr_i32 s6, s3, 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s10 +; GCN-NEXT: s_cmp_lt_u32 s10, 64 +; GCN-NEXT: s_cselect_b32 s3, s3, s6 +; GCN-NEXT: s_cselect_b32 s2, s2, s6 +; GCN-NEXT: s_cselect_b32 s4, s4, s8 +; GCN-NEXT: s_cselect_b32 s5, s5, s9 +; GCN-NEXT: s_cmp_eq_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, s5 +; GCN-NEXT: s_cselect_b32 s0, s0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -432,25 +435,26 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s8 +; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s22 +; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s8 ; GCN-NEXT: s_lshl_b64 s[20:21], s[0:1], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s3, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec @@ -504,25 +508,26 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s22 +; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 ; GCN-NEXT: s_lshr_b64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s1, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec @@ -576,25 +581,26 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v6, 16 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 ; GCN-NEXT: s_sub_i32 s22, 64, s8 ; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 ; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] -; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 +; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s22 +; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s8 ; GCN-NEXT: s_ashr_i64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] +; GCN-NEXT: s_or_b64 s[18:19], s[22:23], s[18:19] ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec ; GCN-NEXT: s_cselect_b32 s19, s19, s21 ; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] ; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec ; GCN-NEXT: s_cselect_b32 s9, s1, s19 ; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec diff --git a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll --- a/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ b/llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -3,23 +3,32 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(ptr addrspace(4) %wei_ptr, ptr addrspace(1) %out_ptr, ptr addrspace(1) %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword -; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 -; CHECK: s_load_dwordx16 -; CHECK: s_load_dwordx16 -; CHECK: s_load_dwordx16 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 +; CHECK-NEXT: s_load_dwordx8 ; CHECK-NOT: v_writelane_b32 ; CHECK-NOT: v_readlane_b32 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -187,7 +187,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end @@ -449,12 +449,12 @@ ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -484,12 +484,12 @@ ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 @@ -526,12 +526,12 @@ ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -561,12 +561,12 @@ ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 @@ -657,12 +657,12 @@ ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 39 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 @@ -692,12 +692,12 @@ ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 39 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 @@ -734,12 +734,12 @@ ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 @@ -769,12 +769,12 @@ ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 @@ -1088,7 +1088,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 -; GCN-IR-NEXT: .LBB8_4: ; %Flow6 +; GCN-IR-NEXT: .LBB8_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -126,8 +126,8 @@ ; VI-NEXT: s_lshr_b32 s5, s7, 16 ; VI-NEXT: s_sub_i32 s6, s6, s7 ; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s5, s6, 0xffff ; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s6, 0xffff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -111,11 +111,11 @@ ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 -; VI-NEXT: s_mul_i32 s1, s1, s0 -; VI-NEXT: s_lshr_b32 s0, s1, 16 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_sext_i32_i16 s2, s2 +; VI-NEXT: s_sext_i32_i16 s3, s3 +; VI-NEXT: s_mul_i32 s3, s3, s2 +; VI-NEXT: s_lshr_b32 s2, s3, 16 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -706,83 +706,88 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_uaddo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e64 v0, s[0:1], v1, v2 +; SI-NEXT: v_add_i32_e64 v0, s[2:3], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; SI-NEXT: .LBB9_2: ; %exit -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_uaddo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; VI-NEXT: v_add_u32_e64 v0, s[0:1], v1, v2 +; VI-NEXT: v_add_u32_e64 v0, s[2:3], v1, v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; VI-NEXT: .LBB9_2: ; %exit -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; VI-NEXT: flat_store_byte v[3:4], v0 +; VI-NEXT: flat_store_byte v[3:4], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_uaddo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v2, v3 +; GFX9-NEXT: v_add_co_u32_e64 v1, s[2:3], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -188,7 +188,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[2:3] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -187,7 +187,7 @@ ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 -; GCN-IR-NEXT: .LBB0_4: ; %Flow6 +; GCN-IR-NEXT: .LBB0_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -707,83 +707,88 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; SI-LABEL: v_usubo_clamp_bit: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 ; SI-NEXT: s_mov_b32 s12, s10 ; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2 +; SI-NEXT: v_sub_i32_e64 v0, s[2:3], v1, v2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; SI-NEXT: .LBB9_2: ; %exit -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_usubo_clamp_bit: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v1, v2 +; VI-NEXT: v_sub_u32_e64 v0, s[2:3], v1, v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; VI-NEXT: .LBB9_2: ; %exit -; VI-NEXT: v_mov_b32_e32 v1, s4 -; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: v_mov_b32_e32 v3, s6 -; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: v_mov_b32_e32 v4, s3 ; VI-NEXT: flat_store_dword v[1:2], v0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; VI-NEXT: flat_store_byte v[3:4], v0 +; VI-NEXT: flat_store_byte v[3:4], v5 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_usubo_clamp_bit: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[8:9] -; GFX9-NEXT: global_load_dword v3, v0, s[10:11] +; GFX9-NEXT: global_load_dword v2, v0, s[4:5] +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3 +; GFX9-NEXT: v_sub_co_u32_e64 v1, s[2:3], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], -1 ; GFX9-NEXT: .LBB9_2: ; %exit -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_byte v0, v2, s[2:3] ; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1265,26 +1265,25 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) #0 { ; GFX1032-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_clause 0x1 ; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1032-NEXT: s_mov_b32 null, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 ; GFX1032-NEXT: s_mov_b32 vcc_lo, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, s2 ; GFX1032-NEXT: s_cbranch_execz .LBB22_2 ; GFX1032-NEXT: ; %bb.1: ; %bb +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: global_load_dword v0, v0, s[2:3] glc dlc +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: global_load_dword v0, v0, s[0:1] glc dlc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_b32 vcc_lo, vcc_lo, exec_lo ; GFX1032-NEXT: .LBB22_2: ; %exit ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 @@ -1293,26 +1292,25 @@ ; ; GFX1064-LABEL: test_div_fmas_f32_i1_phi_vcc: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_clause 0x1 ; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX1064-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX1064-NEXT: s_mov_b32 null, 0 -; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v0 ; GFX1064-NEXT: s_mov_b64 vcc, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: global_load_dwordx3 v[1:3], v1, s[6:7] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] ; GFX1064-NEXT: s_cbranch_execz .LBB22_2 ; GFX1064-NEXT: ; %bb.1: ; %bb +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: global_load_dword v0, v0, s[8:9] glc dlc +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: global_load_dword v0, v0, s[0:1] glc dlc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_b64 vcc, vcc, exec ; GFX1064-NEXT: .LBB22_2: ; %exit -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -112,8 +112,7 @@ ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_ashr_i32 s3, s2, 31 ; CHECK-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s9 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] ; CHECK-NEXT: s_add_u32 s0, s4, s0 ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1] ; CHECK-NEXT: s_addc_u32 s1, s5, s1