diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -478,11 +478,7 @@ ? MRI.getRegClass(Reg) : RI.getPhysRegClass(Reg); - // FIXME: NumLoads should not be subtracted 1. This is to match behavior - // of clusterNeighboringMemOps which was previosly passing cluster length - // less 1. LoadClusterThreshold should be tuned instead. - return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= - LoadClusterThreshold; + return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -162,15 +162,15 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { ; GFX6-LABEL: bfe_u32_zextload_i8: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX6-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 8 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- 
a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -214,43 +214,44 @@ ; ; GCN-LABEL: sdiv_i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_add_i32 s3, s3, s8 -; GCN-NEXT: s_xor_b32 s9, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s3 +; GCN-NEXT: s_ashr_i32 s8, s5, 31 +; GCN-NEXT: s_add_i32 s2, s5, s8 +; GCN-NEXT: s_xor_b32 s11, s2, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GCN-NEXT: s_ashr_i32 s9, s4, 31 +; GCN-NEXT: s_add_i32 s4, s4, s9 +; GCN-NEXT: s_xor_b32 s10, s4, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s3, s3, s8 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s11 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: s_xor_b32 s2, s9, s8 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s11 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: 
v_sub_i32_e32 v4, vcc, s2, v1 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, s10, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %r = sdiv i32 %x, %y @@ -480,20 +481,20 @@ ; ; GCN-LABEL: sdiv_i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s1, s0, 16 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GCN-NEXT: s_sext_i32_i16 s0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_ashr_i32 s5, s4, 16 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GCN-NEXT: s_sext_i32_i16 s4, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -501,7 +502,7 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i16 %x, %y store i16 %r, i16 addrspace(1)* %out @@ -691,20 +692,20 @@ ; ; GCN-LABEL: sdiv_i8: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i32 s1, s0, 0x80008 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GCN-NEXT: s_sext_i32_i8 s0, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_bfe_i32 s5, s4, 0x80008 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GCN-NEXT: s_sext_i32_i8 s4, s4 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -712,7 +713,7 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i8 %x, %y store i8 %r, i8 addrspace(1)* %out @@ -1237,14 +1238,14 @@ ; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 ; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v3, v1, s13 ; GCN-NEXT: 
v_add_i32_e32 v4, vcc, v2, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_hi_u32 v2, v1, s13 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 ; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 @@ -1868,84 +1869,83 @@ ; GCN-LABEL: srem_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[12:19], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s20, 0x4f800000 ; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s16, 31 ; GCN-NEXT: s_add_i32 s3, s16, s2 -; GCN-NEXT: s_xor_b32 s5, s3, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GCN-NEXT: s_mov_b32 s16, 0x4f800000 +; GCN-NEXT: s_xor_b32 s16, s3, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16 ; GCN-NEXT: s_ashr_i32 s6, s12, 31 -; GCN-NEXT: s_ashr_i32 s2, s17, 31 +; GCN-NEXT: s_ashr_i32 s4, s17, 31 +; GCN-NEXT: s_add_i32 s2, s12, s6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_add_i32 s0, s12, s6 -; GCN-NEXT: s_add_i32 s3, s17, s2 -; GCN-NEXT: s_xor_b32 s4, s0, s6 -; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 +; GCN-NEXT: s_add_i32 s5, s17, s4 +; GCN-NEXT: s_xor_b32 s7, s2, s6 +; GCN-NEXT: s_xor_b32 s17, s5, s4 +; GCN-NEXT: v_mul_f32_e32 v0, s20, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s17, s3, s2 -; GCN-NEXT: s_ashr_i32 s7, s13, 31 -; GCN-NEXT: s_add_i32 s12, s13, s7 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s5 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s5 -; GCN-NEXT: s_xor_b32 s12, s12, s7 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s16 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s16 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s17 ; GCN-NEXT: 
v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s4 -; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[2:3] +; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 +; GCN-NEXT: v_mul_f32_e32 v1, s20, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s16 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s17 ; GCN-NEXT: v_mul_hi_u32 v5, v1, s17 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s4, v0 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v2 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v0 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v0 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, s16, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s16, v2 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: s_ashr_i32 s0, s18, 31 +; GCN-NEXT: s_ashr_i32 s7, s13, 31 ; GCN-NEXT: s_add_i32 s1, s18, s0 +; GCN-NEXT: s_add_i32 s12, s13, s7 ; GCN-NEXT: s_xor_b32 s13, s1, s0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s13 +; GCN-NEXT: s_xor_b32 s12, s12, s7 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v1, v1, s12 -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 ; GCN-NEXT: v_mul_lo_u32 v1, v1, s17 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 -; GCN-NEXT: v_mul_f32_e32 v2, s16, v2 +; GCN-NEXT: 
v_mul_f32_e32 v2, s20, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s12, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v1 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 ; GCN-NEXT: v_mul_lo_u32 v5, v2, s13 ; GCN-NEXT: v_mul_hi_u32 v6, v2, s13 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s17, v3 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s17, v3 ; GCN-NEXT: v_sub_i32_e32 v7, vcc, 0, v5 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v5, v5, v2 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s17, v3 ; GCN-NEXT: s_ashr_i32 s6, s14, 31 ; GCN-NEXT: s_add_i32 s12, s14, s6 -; GCN-NEXT: s_xor_b32 s12, s12, s6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v5, v2 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v5, v2 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] @@ -1955,36 +1955,37 @@ ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GCN-NEXT: v_cvt_f32_u32_e32 v3, s14 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GCN-NEXT: s_xor_b32 s12, s12, s6 ; GCN-NEXT: v_mul_hi_u32 v2, v2, s12 -; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v1, s7, v1 ; GCN-NEXT: v_mul_lo_u32 v2, v2, s13 -; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 -; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 +; GCN-NEXT: v_mul_f32_e32 v3, s20, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s7, v1 ; GCN-NEXT: s_ashr_i32 s7, s15, 31 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v3, s14 ; GCN-NEXT: v_mul_hi_u32 v7, v3, s14 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v2 ; GCN-NEXT: s_add_i32 s12, s15, s7 -; GCN-NEXT: s_xor_b32 s12, s12, s7 ; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v6 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7 ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v8, 
s[4:5] ; GCN-NEXT: v_mul_hi_u32 v6, v6, v3 +; GCN-NEXT: s_xor_b32 s12, s12, s7 ; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, s13, v4 -; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s13, v4 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v6, v3 ; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v3, v3, s12 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s13, v4 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GCN-NEXT: v_mul_lo_u32 v3, v3, s14 +; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v2, s6, v2 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s6, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v3 @@ -2837,20 +2838,20 @@ ; ; GCN-LABEL: sdiv_i3: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 -; GCN-NEXT: s_bfe_i32 s0, s0, 0x30000 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_bfe_i32 s5, s4, 0x30008 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GCN-NEXT: s_bfe_i32 s4, s4, 0x30000 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GCN-NEXT: s_xor_b32 s4, s4, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 @@ -2859,7 +2860,7 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: 
v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 7, v0 -; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i3 %x, %y store i3 %r, i3 addrspace(1)* %out @@ -2995,21 +2996,21 @@ ; ; GCN-LABEL: udiv_v3i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd ; GCN-NEXT: s_mov_b32 s8, 0xffff -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s0, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-NEXT: s_and_b32 s6, s2, s8 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GCN-NEXT: s_and_b32 s9, s4, s8 +; GCN-NEXT: s_and_b32 s2, s6, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s0, s2, 16 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GCN-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 @@ -3018,16 +3019,16 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 -; GCN-NEXT: s_and_b32 s0, s1, s8 +; GCN-NEXT: s_and_b32 s4, s7, s8 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GCN-NEXT: s_and_b32 s0, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GCN-NEXT: s_and_b32 s4, s5, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GCN-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 @@ -3038,8 +3039,8 @@ ; GCN-NEXT: v_and_b32_e32 v0, s8, v0 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -3259,18 +3260,18 @@ ; ; GCN-LABEL: sdiv_v3i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s9, s2 -; GCN-NEXT: s_sext_i32_i16 s8, s0 +; GCN-NEXT: s_sext_i32_i16 s9, s4 +; GCN-NEXT: s_sext_i32_i16 s8, s6 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GCN-NEXT: s_xor_b32 s8, s9, s8 -; GCN-NEXT: s_ashr_i32 s0, s0, 16 +; GCN-NEXT: s_ashr_i32 s6, s6, 16 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-NEXT: s_or_b32 s8, s8, 1 @@ -3280,44 +3281,44 @@ ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: s_ashr_i32 s2, s2, 16 +; GCN-NEXT: 
s_ashr_i32 s4, s4, 16 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GCN-NEXT: s_xor_b32 s0, s2, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: s_xor_b32 s4, s4, s6 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: s_sext_i32_i16 s0, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: s_sext_i32_i16 s4, s7 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| ; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GCN-NEXT: s_sext_i32_i16 s1, s3 +; GCN-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: s_xor_b32 s4, s5, s4 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v5, s0 +; GCN-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv <3 x i16> %x, %y 
store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -3409,74 +3410,74 @@ ; ; GCN-LABEL: srem_v3i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sext_i32_i16 s8, s2 -; GCN-NEXT: s_sext_i32_i16 s6, s0 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 -; GCN-NEXT: s_xor_b32 s6, s8, s6 -; GCN-NEXT: s_ashr_i32 s6, s6, 30 +; GCN-NEXT: s_sext_i32_i16 s9, s4 +; GCN-NEXT: s_sext_i32_i16 s8, s6 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 +; GCN-NEXT: s_xor_b32 s8, s9, s8 +; GCN-NEXT: s_ashr_i32 s8, s8, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s6, s6, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_or_b32 s8, s8, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_alignbit_b32 v2, s7, v2, 16 ; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 +; GCN-NEXT: v_alignbit_b32 v1, s5, v1, 16 ; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 ; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 ; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: 
v_mul_lo_u32 v0, v0, s6 ; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 -; GCN-NEXT: s_sext_i32_i16 s0, s1 +; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 ; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GCN-NEXT: s_sext_i32_i16 s4, s7 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| -; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 +; GCN-NEXT: v_cvt_f32_i32_e32 v4, s4 ; GCN-NEXT: v_or_b32_e32 v3, 1, v3 ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: s_sext_i32_i16 s2, s3 +; GCN-NEXT: s_sext_i32_i16 s6, s5 ; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 +; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 -; GCN-NEXT: s_xor_b32 s0, s2, s0 -; GCN-NEXT: s_ashr_i32 s0, s0, 30 -; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: s_xor_b32 s4, s6, s4 +; GCN-NEXT: s_ashr_i32 s4, s4, 30 +; GCN-NEXT: s_or_b32 s4, s4, 1 ; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 ; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 ; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v6, s0 +; GCN-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 +; GCN-NEXT: v_mul_lo_u32 v3, v3, s7 ; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_store_dword v0, off, 
s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -4081,18 +4082,18 @@ ; ; GCN-LABEL: udiv_i32_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out @@ -4188,20 +4189,20 @@ ; ; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: v_mov_b32_e32 v0, 0x100101 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 -; GCN-NEXT: s_lshr_b32 s0, s0, 12 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s5, v0 +; GCN-NEXT: s_lshr_b32 s4, s4, 12 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 11, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; 
GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, <2 x i32> addrspace(1)* %out @@ -4298,61 +4299,61 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_movk_i32 s4, 0x1000 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s2, s4, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_lshl_b32 s10, s4, s3 -; GCN-NEXT: s_mov_b32 s3, 0x4f800000 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 +; GCN-NEXT: s_lshl_b32 s10, s4, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GCN-NEXT: s_mov_b32 s2, 0x4f800000 +; GCN-NEXT: s_lshl_b32 s11, s4, s3 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v0, s3, v0 +; GCN-NEXT: v_mul_f32_e32 v0, s2, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 +; GCN-NEXT: v_mul_f32_e32 v1, s2, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s10 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s10 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 -; GCN-NEXT: v_mul_hi_u32 v2, v1, s10 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_mul_hi_u32 v2, v1, s11 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v2, v2, v1 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 +; GCN-NEXT: v_mul_lo_u32 v5, v0, s10 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v5 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v3 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s10 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s11 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s8, v5 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v4 ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 ; GCN-NEXT: v_add_i32_e32 v2, vcc, -1, v1 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s9, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[2:3] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -4369,20 +4370,20 @@ ; ; GCN-LABEL: urem_i32_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 +; GCN-NEXT: 
v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x12d8fb, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = urem i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out @@ -4557,6 +4558,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_movk_i32 s4, 0x1000 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -4567,7 +4569,6 @@ ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GCN-NEXT: v_mul_f32_e32 v0, s2, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -4576,16 +4577,15 @@ ; GCN-NEXT: v_mul_lo_u32 v2, v0, s10 ; GCN-NEXT: v_mul_hi_u32 v3, v0, s10 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v2, v2, v0 ; GCN-NEXT: v_mul_lo_u32 v3, v1, s11 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v2, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 ; GCN-NEXT: v_mul_hi_u32 v2, v1, s11 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v0, v0, s8 ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] @@ -4612,6 +4612,7 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 
v1, v2, v1, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -4628,18 +4629,18 @@ ; ; GCN-LABEL: sdiv_i32_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mul_hi_i32 v0, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = sdiv i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out @@ -4680,44 +4681,45 @@ ; ; GCN-LABEL: sdiv_i32_pow2_shl_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_add_i32 s3, s3, s8 -; GCN-NEXT: s_xor_b32 s9, s3, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_add_i32 s2, s2, s3 -; GCN-NEXT: s_xor_b32 s2, s2, s3 +; GCN-NEXT: s_lshl_b32 s2, 0x1000, s5 +; GCN-NEXT: s_ashr_i32 s8, s2, 31 +; GCN-NEXT: s_add_i32 s2, s2, s8 +; GCN-NEXT: s_xor_b32 s11, s2, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s11 +; GCN-NEXT: s_ashr_i32 s9, s4, 31 +; GCN-NEXT: s_add_i32 s4, s4, s9 +; GCN-NEXT: 
s_xor_b32 s10, s4, s9 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s3, s3, s8 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s9 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s11 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s11 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_mul_hi_u32 v0, v0, s2 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] +; GCN-NEXT: v_mul_hi_u32 v0, v0, s10 +; GCN-NEXT: s_xor_b32 s2, s9, s8 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s11 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v1 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s10, v1 +; GCN-NEXT: v_sub_i32_e32 v1, vcc, s10, v1 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 ; GCN-NEXT: s_and_b64 vcc, vcc, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm %shl.y = shl i32 4096, %y @@ -4905,56 +4907,56 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd ; GCN-NEXT: s_movk_i32 s4, 0x1000 -; GCN-NEXT: s_mov_b32 s14, 0x4f800000 -; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 
+; GCN-NEXT: s_mov_b32 s15, 0x4f800000 ; GCN-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb ; GCN-NEXT: s_mov_b32 s11, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s2, s4, s2 ; GCN-NEXT: s_ashr_i32 s5, s2, 31 ; GCN-NEXT: s_add_i32 s2, s2, s5 -; GCN-NEXT: s_xor_b32 s13, s2, s5 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 -; GCN-NEXT: s_ashr_i32 s2, s6, 31 -; GCN-NEXT: s_lshl_b32 s0, s4, s3 -; GCN-NEXT: s_add_i32 s1, s6, s2 +; GCN-NEXT: s_xor_b32 s14, s2, s5 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GCN-NEXT: s_lshl_b32 s2, s4, s3 +; GCN-NEXT: s_ashr_i32 s4, s6, 31 +; GCN-NEXT: s_add_i32 s3, s6, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: s_ashr_i32 s6, s0, 31 -; GCN-NEXT: s_add_i32 s4, s0, s6 -; GCN-NEXT: s_xor_b32 s3, s1, s2 -; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 +; GCN-NEXT: s_ashr_i32 s6, s2, 31 +; GCN-NEXT: s_add_i32 s8, s2, s6 +; GCN-NEXT: s_xor_b32 s12, s3, s4 +; GCN-NEXT: v_mul_f32_e32 v0, s15, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b32 s15, s4, s6 -; GCN-NEXT: s_xor_b32 s12, s2, s5 +; GCN-NEXT: s_xor_b32 s16, s8, s6 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 +; GCN-NEXT: s_xor_b32 s13, s4, s5 +; GCN-NEXT: v_mul_lo_u32 v1, v0, s14 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s14 ; GCN-NEXT: s_mov_b32 s10, -1 -; GCN-NEXT: v_mul_lo_u32 v1, v0, s13 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s13 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], 0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[2:3] ; GCN-NEXT: v_mul_hi_u32 v1, v1, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s16 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v0 ; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[2:3] ; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v2 -; GCN-NEXT: v_mul_hi_u32 v0, v0, s3 -; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 -; GCN-NEXT: 
v_mul_lo_u32 v2, v0, s13 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s12 +; GCN-NEXT: v_mul_f32_e32 v1, s15, v1 +; GCN-NEXT: v_mul_lo_u32 v2, v0, s14 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v0 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, s15 -; GCN-NEXT: v_mul_hi_u32 v5, v1, s15 -; GCN-NEXT: s_ashr_i32 s13, s7, 31 -; GCN-NEXT: s_add_i32 s7, s7, s13 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s12, v2 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s16 +; GCN-NEXT: v_mul_hi_u32 v5, v1, s16 +; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s12, v2 +; GCN-NEXT: s_ashr_i32 s12, s7, 31 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 ; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] ; GCN-NEXT: v_mul_hi_u32 v4, v4, v1 -; GCN-NEXT: s_xor_b32 s7, s7, s13 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s3, v2 +; GCN-NEXT: s_add_i32 s7, s7, s12 +; GCN-NEXT: s_xor_b32 s7, s7, s12 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v4, v1 ; GCN-NEXT: v_subrev_i32_e32 v1, vcc, v4, v1 @@ -4963,12 +4965,12 @@ ; GCN-NEXT: s_and_b64 vcc, s[0:1], s[2:3] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v2, v1, s15 -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: s_xor_b32 s4, s13, s6 +; GCN-NEXT: v_mul_lo_u32 v2, v1, s16 +; GCN-NEXT: v_xor_b32_e32 v0, s13, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s13, v0 +; GCN-NEXT: s_xor_b32 s4, s12, s6 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 +; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v3 ; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], s7, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, -1, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 @@ -4977,6 +4979,7 @@ ; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[2:3] ; GCN-NEXT: v_xor_b32_e32 v1, s4, v1 ; 
GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -4993,20 +4996,20 @@ ; ; GCN-LABEL: srem_i32_oddk_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_load_dword s4, s[0:1], 0xb ; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_mul_hi_i32 v0, s4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, 0x12d8fb, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %r = srem i32 %x, 1235195 store i32 %r, i32 addrspace(1)* %out @@ -5484,13 +5487,13 @@ ; ; GCN-LABEL: udiv_i64_pow2_shl_denom: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dword s8, s[0:1], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_add_i32 s8, s8, 12 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -5551,79 +5554,79 @@ ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s6, 0xf001 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v7, 0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: 
v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_movk_i32 s0, 0xfff -; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s6 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: s_movk_i32 s0, 0xfff +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 -; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 -; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, v3, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_mul_hi_u32 v4, v0, s6 +; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] +; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 +; GCN-NEXT: v_mul_lo_u32 v6, v0, s6 +; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc -; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc +; GCN-NEXT: v_mul_lo_u32 v10, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[2:3] +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, 
v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 ; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 ; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 ; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 @@ -6333,9 +6336,8 @@ ; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s6, 0xf001 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -6364,6 +6366,7 @@ ; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 ; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GCN-NEXT: v_mov_b32_e32 v4, 0 
@@ -6494,8 +6497,8 @@ ; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_subb_u32 s7, 0, s15 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 ; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -7136,8 +7139,8 @@ ; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_subb_u32 s7, 0, s17 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 ; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -330,36 +330,35 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, 0xff00ff +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s2, 0xff00ff ; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f ; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0 ; SI-NEXT: s_mov_b32 s10, 0x33333333 ; SI-NEXT: s_mov_b32 s11, 0xcccccccc -; SI-NEXT: s_mov_b32 s0, 0x55555555 -; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s12, 0x55555555 
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s6, v0, v2 -; SI-NEXT: v_bfi_b32 v4, s6, v1, v3 +; SI-NEXT: v_bfi_b32 v2, s2, v0, v2 +; SI-NEXT: v_bfi_b32 v4, s2, v1, v3 ; SI-NEXT: v_and_b32_e32 v1, s8, v2 ; SI-NEXT: v_and_b32_e32 v0, s8, v4 ; SI-NEXT: v_and_b32_e32 v3, s9, v2 ; SI-NEXT: v_and_b32_e32 v2, s9, v4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, 0xaaaaaaaa ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_and_b32_e32 v1, s10, v3 @@ -370,64 +369,66 @@ ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s0, v3 -; SI-NEXT: v_and_b32_e32 v0, s0, v2 -; SI-NEXT: v_and_b32_e32 v3, s1, v3 -; SI-NEXT: v_and_b32_e32 v2, s1, v2 +; SI-NEXT: v_and_b32_e32 v1, s12, v3 +; SI-NEXT: v_and_b32_e32 v0, s12, v2 +; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v2, s2, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s6, 0x10203 -; FLAT-NEXT: s_mov_b32 s2, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s8, 0x10203 +; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: 
s_mov_b32 s6, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s5, 0x33333333 +; FLAT-NEXT: s_mov_b32 s7, 0x55555555 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6 -; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, v0, s8 +; FLAT-NEXT: v_perm_b32 v4, 0, v1, s8 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v4 +; FLAT-NEXT: v_and_b32_e32 v3, s4, v2 +; FLAT-NEXT: v_and_b32_e32 v2, s4, v4 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: s_mov_b32 s0, 0x55555555 +; FLAT-NEXT: s_mov_b32 s2, 0xaaaaaaaa ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s5, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s5, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s6, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s6, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] -; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 
v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s7, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s7, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s2, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -843,22 +843,22 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-DAG: v_mov_b32_e32 
[[REG13:v[0-9]+]], 13 +; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-DAG: buffer_store_dword [[REG8]], {{.*$}} +; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc @@ -877,22 +877,22 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 +; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN-DAG: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN-DAG: buffer_store_dword 
[[REG8]], {{.*$}} +; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 0x40e00000 ; GCN: s_getpc diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -50,37 +50,38 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; 
SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -45,11 +45,11 @@ ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 -; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; SI-DAG: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-DAG: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; GCN: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { diff --git 
a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -357,29 +357,28 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:6 ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v2, v7, v6 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v0 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 +; SI-NEXT: v_or_b32_e32 v2, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:24 +; SI-NEXT: 
buffer_store_dword v6, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -407,27 +406,27 @@ ; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v6, v[6:7] -; VI-NEXT: flat_load_ubyte v7, v[8:9] -; VI-NEXT: flat_load_ubyte v8, v[10:11] -; VI-NEXT: flat_load_ubyte v9, v[12:13] +; VI-NEXT: flat_load_ubyte v8, v[8:9] +; VI-NEXT: flat_load_ubyte v9, v[10:11] +; VI-NEXT: flat_load_ubyte v10, v[12:13] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v1, v[2:3] ; VI-NEXT: flat_load_ubyte v2, v[4:5] +; VI-NEXT: flat_load_ubyte v3, v[6:7] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 -; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v9 +; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v3 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v10 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -2443,12 +2443,12 @@ } ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: -; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] -; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] -; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] -; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 +; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 ; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1196,9 +1196,9 @@ ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; SI-NEXT: s_mov_b32 s4, 0xffffff ; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab @@ -1212,32 +1212,31 @@ ; SI-NEXT: v_and_b32_e32 v2, s4, v2 ; SI-NEXT: v_mul_hi_u32 v12, v2, s5 ; SI-NEXT: s_waitcnt vmcnt(3) 
-; SI-NEXT: v_and_b32_e32 v3, s4, v3 -; SI-NEXT: v_mul_hi_u32 v13, v3, s5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, s4, v4 +; SI-NEXT: v_and_b32_e32 v11, s4, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v5, s4, v5 +; SI-NEXT: v_mul_hi_u32 v13, v5, s5 ; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 ; SI-NEXT: v_mul_lo_u32 v12, v12, 24 ; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 ; SI-NEXT: v_mul_lo_u32 v13, v13, 24 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 ; SI-NEXT: v_lshr_b32_e32 v12, v14, v2 -; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v13 +; SI-NEXT: v_sub_i32_e32 v5, vcc, v5, v13 ; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2 -; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3 +; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v5 ; SI-NEXT: v_and_b32_e32 v13, s4, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshl_b32_e32 v5, v5, v13 +; SI-NEXT: v_lshl_b32_e32 v4, v4, v13 ; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 -; SI-NEXT: v_lshr_b32_e32 v11, v11, v3 +; SI-NEXT: v_lshr_b32_e32 v11, v11, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b32_e32 v6, v6, v14 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_or_b32_e32 v4, v4, v12 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; SI-NEXT: v_or_b32_e32 v6, v6, v11 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; SI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen ; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 @@ -1256,9 +1255,9 @@ ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; VI-NEXT: s_mov_b32 s4, 0xffffff ; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab @@ -1272,32 +1271,31 @@ ; VI-NEXT: v_and_b32_e32 v2, s4, v2 ; VI-NEXT: v_mul_hi_u32 v12, v2, s5 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_and_b32_e32 v3, s4, v3 -; VI-NEXT: v_mul_hi_u32 v13, v3, s5 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_and_b32_e32 v11, s4, v4 +; VI-NEXT: v_and_b32_e32 v11, s4, v3 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_and_b32_e32 v5, s4, v5 +; VI-NEXT: v_mul_hi_u32 v13, v5, s5 ; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 ; VI-NEXT: v_mul_lo_u32 v12, v12, 24 ; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 ; VI-NEXT: v_mul_lo_u32 v13, v13, 24 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 ; VI-NEXT: v_lshrrev_b32_e32 v12, v2, v14 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v13 +; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v13 ; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2 -; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3 +; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v5 ; VI-NEXT: v_and_b32_e32 v13, s4, v13 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v5, v13, v5 +; VI-NEXT: v_lshlrev_b32_e32 v4, v13, v4 ; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 -; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11 +; VI-NEXT: v_lshrrev_b32_e32 v11, v5, v11 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v6, v14, v6 -; VI-NEXT: v_or_b32_e32 v5, v5, v12 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc ; VI-NEXT: v_or_b32_e32 v6, v6, v11 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; VI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen ; VI-NEXT: 
buffer_store_short v1, v0, s[0:3], 0 offen ; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 @@ -1314,10 +1312,10 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -1325,31 +1323,29 @@ ; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 ; GFX9-NEXT: v_mul_hi_u32 v6, v2, s5 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v3, s5 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v9, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v4, s5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v9, s4, v8 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v6, v2, v10 -; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v7, 24, v2 -; GFX9-NEXT: v_sub_u32_e32 v10, 24, v3 +; GFX9-NEXT: v_sub_u32_e32 v10, 24, v4 ; GFX9-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, v3, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, v4, v9 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshl_or_b32 v5, 
v5, v7, v6 +; GFX9-NEXT: v_lshl_or_b32 v3, v3, v7, v6 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshl_or_b32 v6, v8, v10, v9 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_lshl_or_b32 v5, v5, v10, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 ; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -830,9 +830,9 @@ ; GFX7-LABEL: notudot2_SameVec: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: s_load_dword s7, s[0:1], 0x0 @@ -2546,90 +2546,90 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword 
s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 16 -; GFX7-NEXT: s_lshr_b32 s7, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_and_b32 s5, s5, s8 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_and_b32 s1, s1, s8 +; GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s3, s2, s1 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: 
v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2681,20 +2681,20 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s8, s6 ; GFX7-NEXT: s_mov_b32 s9, s7 +; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_mov_b32 s6, s2 ; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: s_mov_b32 s11, s3 -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX7-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8 +; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v1, s4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll 
@@ -177,60 +177,60 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s6, s4 -; GFX7-NEXT: s_sext_i32_i8 s7, s5 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s2, s0 +; GFX7-NEXT: s_sext_i32_i8 s3, s1 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x80010 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x80008 ; GFX7-NEXT: s_and_b32 s10, s10, s8 -; GFX7-NEXT: s_and_b32 s6, s6, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010 -; GFX7-NEXT: s_ashr_i32 s5, s5, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x80010 +; GFX7-NEXT: s_ashr_i32 s1, s1, 24 ; GFX7-NEXT: s_and_b32 s12, s12, s8 ; GFX7-NEXT: s_and_b32 s9, s9, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_ashr_i32 s4, s4, 24 +; GFX7-NEXT: s_ashr_i32 s0, s0, 24 ; GFX7-NEXT: s_and_b32 s11, s11, s8 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_and_b32 s1, s1, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; 
GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i8 s3, s2 ; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 @@ -248,20 +248,20 @@ ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 ; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 @@ -279,18 +279,18 @@ ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -352,114 +352,114 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 
-; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; 
GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s1, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; 
GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -894,45 +894,46 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i32 s6, s4, 24 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010 -; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008 -; GFX7-NEXT: s_ashr_i32 s9, s5, 24 -; GFX7-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010 -; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s4, s4 -; 
GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_ashr_i32 s2, s0, 24 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x80010 +; GFX7-NEXT: s_bfe_i32 s11, s1, 0x80008 +; GFX7-NEXT: s_ashr_i32 s9, s1, 24 +; GFX7-NEXT: s_sext_i32_i8 s1, s1 +; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX7-NEXT: s_bfe_i32 s8, s0, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s7, v3, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0 ; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -179,114 +179,114 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; 
GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 
0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_movk_i32 s3, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s1, s0, s3 +; GFX8-NEXT: s_and_b32 s3, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s3, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s3 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 
+; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -349,114 +349,114 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: 
s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s1, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; 
GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 ; GFX9-NODL-NEXT: 
v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -512,97 +512,100 @@ define amdgpu_kernel void @udot2_8(<4 x 
i8> addrspace(1)* %src1, ; GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_and_b32 s6, s5, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80008 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s0, s8 +; GFX7-NEXT: s_and_b32 s2, s1, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; 
GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s2, s0 -; GFX8-NEXT: s_and_b32 s0, s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s3, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s1, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NODL-NEXT: s_and_b32 s3, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s2, s1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s4, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s3, s0, s4 +; GFX9-DL-NEXT: s_and_b32 s2, s1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; 
GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -653,114 +656,116 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: s_and_b32 s7, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s2, s0, s8 +; GFX7-NEXT: s_and_b32 s3, s1, s8 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: 
buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s3, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, 
v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: 
v_mad_u32_u24 v2, s3, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -816,129 +821,132 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: s_and_b32 s7, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s2, s0, s8 +; GFX7-NEXT: s_and_b32 s3, s1, s8 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; 
GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s3, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: 
global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s4, 0xff ; 
GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-DL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-DL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1417,54 +1425,54 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; 
%bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s6 -; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s5, s4 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s3, s2 +; GFX7-NEXT: s_bfe_u32 s9, s2, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s1, s0 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-NEXT: s_and_b32 s5, s5, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s2, 0x80010 +; GFX7-NEXT: s_and_b32 s1, s1, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: 
v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_sext_i32_i8 s3, s2 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX8-NEXT: s_sext_i32_i8 s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 @@ -1482,20 +1490,20 @@ ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, 
s2 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 @@ -1513,20 +1521,20 @@ ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 @@ -1801,64 +1809,65 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; 
GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s7, 0xff +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_movk_i32 s3, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008 -; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GFX7-NEXT: s_lshr_b32 s9, s6, 24 -; GFX7-NEXT: s_and_b32 s6, s6, s7 -; GFX7-NEXT: s_lshr_b32 s5, s4, 24 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s2, 0x80010 +; GFX7-NEXT: s_lshr_b32 s9, s2, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s3 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_and_b32 s0, s0, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; 
GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s0 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX8-NEXT: s_lshr_b32 s4, s2, 24 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s6, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s1, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s1 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2000,35 +2009,35 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_lshr_b32 s11, s5, 16 -; GFX7-NEXT: s_lshr_b32 s12, s5, 24 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX7-NEXT: s_lshr_b32 s11, s1, 16 +; GFX7-NEXT: s_lshr_b32 s12, s1, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_lshr_b32 s9, s4, 24 +; GFX7-NEXT: s_lshr_b32 s9, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: s_mul_i32 s4, s4, s5 +; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: v_mul_u32_u24_e32 v1, s9, v1 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s7, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s6, v3 -; GFX7-NEXT: s_and_b32 s5, s4, s8 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s3, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s2, v3 +; GFX7-NEXT: s_and_b32 s1, s0, s8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v2, s1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -2036,25 +2045,26 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: 
buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_lshr_b32 s4, s1, 24 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -254,78 +254,79 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s3, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 ; GFX7-NEXT: s_and_b32 s10, s10, s8 -; GFX7-NEXT: s_and_b32 s6, s6, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c ; GFX7-NEXT: s_and_b32 s12, s12, s8 ; GFX7-NEXT: s_and_b32 s9, s9, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c -; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 ; GFX7-NEXT: s_and_b32 s14, s14, s8 ; GFX7-NEXT: s_and_b32 s11, s11, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 -; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 ; GFX7-NEXT: s_and_b32 s16, s16, s8 ; GFX7-NEXT: s_and_b32 s13, s13, s8 ; GFX7-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 ; GFX7-NEXT: s_and_b32 s18, s18, s8 ; GFX7-NEXT: s_and_b32 s15, s15, s8 ; GFX7-NEXT: v_mov_b32_e32 v5, s16 -; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 -; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 +; GFX7-NEXT: s_ashr_i32 s1, s1, 28 ; GFX7-NEXT: s_and_b32 s20, s20, s8 ; GFX7-NEXT: s_and_b32 s17, s17, s8 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; 
GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 ; GFX7-NEXT: s_and_b32 s19, s19, s8 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_and_b32 s1, s1, s8 ; GFX7-NEXT: v_mov_b32_e32 v7, s20 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 ; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 @@ -369,14 +370,15 @@ ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 ; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 @@ -420,14 +422,15 @@ ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 ; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 @@ -589,89 +592,89 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: 
buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s3, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 ; GFX7-NEXT: s_and_b32 s10, s10, s8 -; GFX7-NEXT: s_and_b32 s6, s6, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c ; GFX7-NEXT: s_and_b32 s12, s12, s8 ; GFX7-NEXT: s_and_b32 s9, s9, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c -; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 ; GFX7-NEXT: s_and_b32 s14, s14, s8 ; GFX7-NEXT: s_and_b32 s11, s11, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 -; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 ; GFX7-NEXT: s_and_b32 s16, s16, s8 ; GFX7-NEXT: s_and_b32 s13, s13, s8 ; GFX7-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 ; GFX7-NEXT: s_and_b32 s18, s18, s8 ; GFX7-NEXT: s_and_b32 s15, s15, s8 ; 
GFX7-NEXT: v_mov_b32_e32 v5, s16 -; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 -; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 +; GFX7-NEXT: s_ashr_i32 s1, s1, 28 ; GFX7-NEXT: s_and_b32 s20, s20, s8 ; GFX7-NEXT: s_and_b32 s17, s17, s8 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 ; GFX7-NEXT: s_and_b32 s19, s19, s8 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_and_b32 s1, s1, s8 ; GFX7-NEXT: v_mov_b32_e32 v7, s20 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 
s6, s3, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s3, 12 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 ; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_lshr_b32 s3, s2, 12 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s3 ; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 @@ -679,53 +682,53 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 ; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 ; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s14 ; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: s_ashr_i32 s2, s2, 28 ; GFX8-NEXT: v_mov_b32_e32 v10, s16 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 
; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_movk_i32 s2, 0xff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s3, 12 -; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 ; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_lshr_b32 s3, s2, 12 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s3 ; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 @@ -733,53 +736,53 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 
s14, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 ; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 ; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s14 ; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_movk_i32 s2, 0xff +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_movk_i32 s6, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 ; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s3 ; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 @@ -787,28 +790,28 @@ ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 ; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 ; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 ; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -1580,14 +1583,14 @@ ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 ; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 @@ -1639,20 +1642,20 @@ ; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s1, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s27, s3, 28 ; GFX8-NEXT: s_ashr_i64 s[16:17], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s15, s1, 28 ; GFX8-NEXT: s_lshl_b32 s19, s3, 8 ; GFX8-NEXT: s_lshl_b32 s21, s3, 12 -; 
GFX8-NEXT: s_lshl_b32 s15, s1, 28 ; GFX8-NEXT: s_lshl_b32 s23, s3, 16 ; GFX8-NEXT: s_lshl_b32 s25, s3, 24 ; GFX8-NEXT: s_lshl_b32 s17, s3, 4 @@ -1938,49 +1941,49 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff -; GFX7-NEXT: s_mov_b32 s9, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_mov_b32 s9, 0xffff +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000 -; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40004 -; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40008 -; GFX7-NEXT: s_bfe_i32 s18, s5, 0x4000c -; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40010 -; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40014 -; GFX7-NEXT: s_bfe_i32 s21, s5, 0x40018 -; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s15, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40004 +; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40008 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x4000c +; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40010 +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40014 +; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018 +; GFX7-NEXT: s_ashr_i32 s1, s1, 28 ; GFX7-NEXT: v_mov_b32_e32 v8, s15 -; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s3, s0, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v7, s16 -; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; 
GFX7-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v6, s17 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v5, s18 -; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX7-NEXT: v_mov_b32_e32 v4, s19 -; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40014 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x40014 ; GFX7-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX7-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX7-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-NEXT: s_ashr_i32 s4, s4, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s0, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v9, s12, v4 ; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 ; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 -; GFX7-NEXT: v_mul_i32_i24_e32 v7, s7, v7 -; GFX7-NEXT: v_mul_i32_i24_e32 v8, s6, v8 +; GFX7-NEXT: v_mul_i32_i24_e32 v7, s3, v7 +; GFX7-NEXT: v_mul_i32_i24_e32 v8, s2, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -2014,20 +2017,21 @@ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_mov_b32 s33, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: 
flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s1, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshl_b32 s11, s1, 24 ; GFX8-NEXT: s_lshl_b32 s15, s1, 16 @@ -2103,15 +2107,16 @@ ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s0, 4 ; GFX9-NEXT: s_lshr_b32 s14, s1, 4 @@ -2138,21 +2143,21 @@ ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s2, s0, 20 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s1, 20 ; GFX9-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 
v11, 12, s3 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s3 +; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s2 ; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 ; GFX9-NEXT: s_lshr_b32 s5, s0, 28 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_lshr_b32 s12, s1, 28 ; GFX9-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 @@ -2174,7 +2179,7 @@ ; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 @@ -2191,15 +2196,16 @@ ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 ; GFX9-DL-NEXT: 
s_lshr_b32 s14, s1, 4 @@ -2226,21 +2232,21 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20 ; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s3 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s2 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 ; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28 ; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28 ; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 @@ -2262,7 +2268,7 @@ ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15 ; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX9-DL-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- 
a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -254,32 +254,32 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_and_b32 s5, s5, 15 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 -; GFX7-NEXT: s_and_b32 s4, s4, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; 
GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 @@ -287,28 +287,29 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -349,14 +350,15 @@ ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 ; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -397,14 +399,15 @@ ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -557,32 +560,32 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; 
GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_and_b32 s5, s5, 15 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 -; GFX7-NEXT: s_and_b32 s4, s4, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 @@ -590,28 +593,29 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; 
GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 ; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -652,14 +656,15 @@ ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: 
s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 ; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -700,14 +705,15 @@ ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 ; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 @@ -860,32 +866,32 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: 
s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_and_b32 s5, s5, 15 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 -; GFX7-NEXT: s_and_b32 s4, s4, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 @@ -893,29 +899,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: 
buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s8, s0, 15 ; GFX8-NEXT: s_and_b32 s15, s1, 15 @@ -959,14 +966,15 @@ ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s8, s0, 15 ; GFX9-NEXT: s_and_b32 s15, s1, 15 @@ -1010,14 +1018,15 @@ ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 @@ -1160,32 +1169,32 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_and_b32 s5, s5, 15 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 
s12, s4, 0x40004 -; GFX7-NEXT: s_and_b32 s4, s4, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 @@ -1193,29 +1202,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s8, s0, 15 ; GFX8-NEXT: s_and_b32 s15, s1, 15 @@ -1259,14 +1269,15 @@ ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s8, s0, 15 ; GFX9-NEXT: s_and_b32 s15, s1, 15 @@ -1310,14 +1321,15 @@ ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, 
s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 @@ -1976,38 +1988,38 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40004 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40004 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v4, s17 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_and_b32 s18, s5, 15 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_and_b32 s18, s1, 15 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; 
GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_and_b32 s11, s4, 15 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_and_b32 s11, s0, 15 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -2025,22 +2037,23 @@ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 ; GFX8-NEXT: 
s_bfe_u32 s10, s1, 0x40018 @@ -2278,53 +2291,53 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s17, s5, 28 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s17, s1, 28 ; GFX7-NEXT: v_mov_b32_e32 v8, s13 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40008 -; GFX7-NEXT: s_and_b32 s16, s5, 15 -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX7-NEXT: s_and_b32 s16, s1, 15 +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v6, s15 -; GFX7-NEXT: s_lshr_b32 s10, s4, 28 +; GFX7-NEXT: s_lshr_b32 s10, s0, 28 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 ; GFX7-NEXT: v_mul_u32_u24_e32 v6, s8, v6 -; GFX7-NEXT: v_mul_u32_u24_e32 v8, s6, v8 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40008 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s2, 
v8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 -; GFX7-NEXT: s_and_b32 s9, s4, 15 +; GFX7-NEXT: s_and_b32 s9, s0, 15 ; GFX7-NEXT: v_mov_b32_e32 v5, s16 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40018 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40014 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_mul_u32_u24_e32 v5, s9, v5 -; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 +; GFX7-NEXT: v_mul_u32_u24_e32 v9, s0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2342,38 +2355,39 @@ ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s4, 0xffff 
; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s15, s2, 15 -; GFX8-NEXT: s_bfe_u32 s16, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s3, s1, 0x40014 -; GFX8-NEXT: s_lshr_b32 s5, s1, 28 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX8-NEXT: s_lshr_b32 s12, s2, 28 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s8, s1, 15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s2, s0, 0x40014 +; GFX8-NEXT: s_lshr_b32 s5, s0, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40010 +; GFX8-NEXT: s_lshr_b32 s12, s1, 28 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX8-NEXT: s_and_b32 s8, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v4, s16 ; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_mov_b32_e32 v6, s15 @@ -2382,27 +2396,27 @@ ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40010 -; 
GFX8-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40018 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s12 ; GFX8-NEXT: v_mov_b32_e32 v11, s5 ; GFX8-NEXT: v_mov_b32_e32 v12, s11 ; GFX8-NEXT: v_mov_b32_e32 v13, s10 -; GFX8-NEXT: v_mov_b32_e32 v14, s3 -; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v9, s4, v12 +; GFX8-NEXT: v_mul_u32_u24_e32 v9, s3, v12 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v4, s4, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -2421,17 +2435,18 @@ ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; 
GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010 ; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 ; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 @@ -2441,7 +2456,7 @@ ; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v4, s11 ; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 ; GFX9-NEXT: v_mov_b32_e32 v5, s12 @@ -2455,8 +2470,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v3, s2, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7 @@ -2466,11 +2481,11 @@ ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, s2, 
v3 +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 @@ -2487,17 +2502,18 @@ ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 ; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 ; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 @@ -2507,7 +2523,7 @@ ; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 ; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 @@ -2521,8 +2537,8 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s3, v4 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7 @@ -2532,11 +2548,11 @@ ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 @@ -2651,32 +2667,32 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 28 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 -; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c -; GFX7-NEXT: 
s_bfe_u32 s18, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 -; GFX7-NEXT: s_lshr_b32 s13, s5, 28 -; GFX7-NEXT: s_and_b32 s5, s5, 15 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 -; GFX7-NEXT: s_and_b32 s4, s4, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 28 +; GFX7-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s1, 28 +; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40004 +; GFX7-NEXT: s_and_b32 s0, s0, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 ; GFX7-NEXT: v_mov_b32_e32 v3, s18 ; GFX7-NEXT: v_mov_b32_e32 v4, s17 @@ -2684,29 +2700,30 @@ ; GFX7-NEXT: v_mov_b32_e32 v6, s15 ; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v7, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; 
GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_and_b32 s8, s0, 15 ; GFX8-NEXT: s_and_b32 s15, s1, 15 @@ -2750,14 +2767,15 @@ ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s8, s0, 15 ; GFX9-NEXT: s_and_b32 s15, s1, 15 @@ -2801,14 +2819,15 @@ ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 ; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -205,7 +205,7 @@ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101 -; GCN: s_and_b32 s3, s1, [[K]] +; GCN: s_and_b32 s{{[0-9]+}}, s1, [[K]] ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] ; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1617,36 +1617,36 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x10 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 ; SI-NEXT: s_add_u32 s0, s0, s7 ; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_mov_b32_e32 v16, 64 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_and_b32 s4, s4, 7 -; 
SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: v_mov_b32_e32 v6, s18 -; SI-NEXT: v_mov_b32_e32 v7, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: s_and_b32 s6, s6, 7 +; SI-NEXT: s_lshl_b32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 +; SI-NEXT: v_or_b32_e32 v16, s6, v16 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen @@ -1654,47 +1654,47 @@ ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s7, 0x100f000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; VI-NEXT: s_add_u32 s0, s0, s7 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_and_b32 s4, s4, 7 -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_mov_b32_e32 v12, s24 -; VI-NEXT: v_mov_b32_e32 v13, s25 -; VI-NEXT: v_mov_b32_e32 v14, s26 -; VI-NEXT: v_mov_b32_e32 v15, s27 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: v_mov_b32_e32 v5, s17 -; VI-NEXT: v_mov_b32_e32 v6, s18 -; VI-NEXT: v_mov_b32_e32 v7, s19 -; VI-NEXT: v_mov_b32_e32 v8, s20 -; VI-NEXT: v_mov_b32_e32 v9, s21 -; VI-NEXT: v_mov_b32_e32 v10, s22 -; VI-NEXT: v_mov_b32_e32 v11, s23 +; VI-NEXT: s_and_b32 s6, s6, 7 +; VI-NEXT: s_lshl_b32 s6, s6, 3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v12, s20 +; VI-NEXT: v_mov_b32_e32 v13, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 +; VI-NEXT: v_or_b32_e32 v16, s6, v16 +; VI-NEXT: 
v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v10, s18 +; VI-NEXT: v_mov_b32_e32 v11, s19 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; VI-NEXT: v_or_b32_e32 v16, s4, v16 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen @@ -1702,13 +1702,13 @@ ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s7, 0x1100f000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:48 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll 
b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -131,14 +131,14 @@ ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: s_load_dword s1, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_and_b32 s1, s1, 0xffff ; CI-NEXT: s_lshl_b32 s2, s0, 16 ; CI-NEXT: s_or_b32 s1, s1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s1 @@ -1102,14 +1102,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1120,14 +1120,14 @@ ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[4:7], 
s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[2:3], 0x0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s0, s0, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1138,14 +1138,14 @@ ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 -; CI-NEXT: s_load_dword s1, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 +; CI-NEXT: s_load_dword s1, s[6:7], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b32 s0, s0, 4 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1683,14 +1683,14 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: s_mov_b32 s1, 0 @@ -1699,7 +1699,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; 
GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1 ; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0 @@ -1804,26 +1805,27 @@ ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_mov_b32 s0, 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_mov_b32 s1, 0 -; VI-NEXT: s_lshl_b32 s2, s5, 4 -; VI-NEXT: s_and_b32 s3, s4, s0 -; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; VI-NEXT: s_lshl_b32 s2, s3, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: s_and_b32 s2, s2, s0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 +; VI-NEXT: s_lshl_b32 s3, s2, 16 +; VI-NEXT: s_or_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bfi_b32 v1, s1, v4, v1 ; VI-NEXT: v_bfi_b32 v0, s0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1832,26 +1834,27 @@ ; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], 
v[0:1] +; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_and_b32 s2, s4, s0 -; CI-NEXT: s_lshl_b32 s4, s4, 16 ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_or_b32 s2, s2, s4 ; CI-NEXT: s_mov_b32 s1, 0 -; CI-NEXT: s_lshl_b32 s3, s5, 4 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_and_b32 s4, s2, s0 +; CI-NEXT: s_lshl_b32 s2, s2, 16 +; CI-NEXT: s_or_b32 s2, s4, s2 +; CI-NEXT: s_lshl_b32 s3, s3, 4 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_bfi_b32 v1, s1, v4, v1 ; CI-NEXT: v_bfi_b32 v0, s0, v5, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -237,29 +237,29 @@ ; SI-LABEL: maxnum_v2f16: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s6, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_lshr_b32 s0, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: 
v_max_f32_e32 v2, v3, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -260,29 +260,29 @@ ; SI-LABEL: minnum_v2f16_ieee: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s6, s[6:7], 0x0 ; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 ; SI-NEXT: s_lshr_b32 s0, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v3, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -142,15 +142,15 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s14, s0, s7 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 ; SI-NEXT: s_brev_b32 s15, 1 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] @@ -237,15 +237,15 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s19, s0, s18 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 ; SI-NEXT: s_brev_b32 s20, 1 ; SI-NEXT: s_andn2_b64 s[16:17], s[10:11], s[0:1] @@ -342,47 +342,47 @@ ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s12, -2 ; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], 
-v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v4, s9 +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v4, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[14:15] -; CI-NEXT: v_mov_b32_e32 v10, s15 -; CI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; CI-NEXT: v_bfi_b32 v10, s2, v12, v10 +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] +; CI-NEXT: v_mov_b32_e32 v10, s11 +; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v6, 0 ; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] ; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v13, s13 +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v13, s9 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 +; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] +; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out @@ -588,12 +588,11 @@ ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] ; CI-NEXT: v_mov_b32_e32 v4, s11 @@ -652,6 +651,7 @@ ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 ; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -10,12 +10,8 @@ ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 @@ -101,12 
+97,8 @@ ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -74,20 +74,20 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off ; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 ; GCN-NEXT: s_endpgm bb: @@ -118,6 +118,7 @@ ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:16 ; GCN-NEXT: buffer_load_dword v6, v0, 
s[0:3], 0 offen offset:20 ; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24 ; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28 @@ -132,22 +133,20 @@ ; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 ; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 -; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12 -; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:16 ; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20 ; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24 ; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28 @@ -196,19 +195,18 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v11, s5 +; GCN-NEXT: v_mov_b32_e32 v10, s4 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NEXT: 
v_mov_b32_e32 v8, s4 +; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[10:11], v[4:7], off offset:16 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -529,8 +529,8 @@ ; GCN-LABEL: {{^}}merge_global_store_5_constants_i32: ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}} -; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} -; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} ; GCN: buffer_store_dword v[[HI]] define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -15,13 +15,13 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -86,6 +86,7 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -95,20 +96,19 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) %conv = and i64 %call, 255 @@ -224,15 +224,15 @@ ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, 
v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -299,9 +299,9 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -454,10 +454,10 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -520,10 +520,10 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX9: global_load_dwordx2 
v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} entry: %call = tail call i64 @_Z13get_global_idj(i32 0) #2 %conv = and i64 %call, 255 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -854,16 +854,16 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s0, s14 ; GCN-NEXT: s_mov_b32 s1, s15 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_mov_b32 s14, 0x4f800000 ; GCN-NEXT: s_mov_b32 s8, s12 ; GCN-NEXT: s_mov_b32 s9, s13 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GCN-NEXT: v_xor_b32_e32 v4, v4, v9 ; GCN-NEXT: v_xor_b32_e32 v15, v8, v9 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, v4 @@ -1002,16 +1002,16 @@ ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_mov_b32 s0, s14 ; TONGA-NEXT: s_mov_b32 s1, s15 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; TONGA-NEXT: s_mov_b32 s14, 0x4f800000 ; TONGA-NEXT: s_mov_b32 s8, s12 ; TONGA-NEXT: s_mov_b32 s9, s13 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v0 -; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_ashrrev_i32_e32 v9, 31, v4 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, 
v0 ; TONGA-NEXT: v_xor_b32_e32 v4, v4, v9 ; TONGA-NEXT: v_xor_b32_e32 v15, v8, v9 ; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v4 @@ -1150,15 +1150,15 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s0, s10 ; GFX9-NEXT: s_mov_b32 s1, s11 -; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_mov_b32 s4, 0x4f800000 ; GFX9-NEXT: s_mov_b32 s12, s8 ; GFX9-NEXT: s_mov_b32 s13, s9 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0 ; GFX9-NEXT: v_add_u32_e32 v4, v4, v9 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v8 ; GFX9-NEXT: v_xor_b32_e32 v4, v4, v9 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 @@ -26,76 +26,76 @@ ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s15, s14 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v3 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, 
v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v8, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[0:1] -; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_mul_lo_u32 v7, s5, v0 ; GCN-NEXT: v_mul_lo_u32 v6, s4, v0 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 +; GCN-NEXT: v_mul_lo_u32 v7, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v11, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v3, v6 -; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; 
GCN-NEXT: v_addc_u32_e64 v4, vcc, v3, v5, s[0:1] +; GCN-NEXT: v_mul_lo_u32 v6, s4, v4 +; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GCN-NEXT: v_mul_lo_u32 v10, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 +; GCN-NEXT: v_mul_hi_u32 v9, v4, v7 +; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v8, v4, v6 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v7, v12, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[0:1] +; GCN-NEXT: v_addc_u32_e32 v11, vcc, v2, v12, vcc +; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: s_add_u32 s0, s10, s14 ; GCN-NEXT: s_addc_u32 s1, s11, s14 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 +; 
GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v6, s10, v3 +; GCN-NEXT: v_mul_hi_u32 v7, s11, v3 +; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v2, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, s11, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 ; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 @@ -497,14 +497,14 @@ define amdgpu_kernel void @s_test_sdiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv24_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -527,14 +527,14 @@ ; ; GCN-IR-LABEL: s_test_sdiv24_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: 
s_ashr_i64 s[8:9], s[0:1], 40 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s0, s4 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 40 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -669,14 +669,14 @@ define amdgpu_kernel void @s_test_sdiv31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv31_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -699,14 +699,14 @@ ; ; GCN-IR-LABEL: s_test_sdiv31_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 33 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 33 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s0, s4 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 33 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -736,14 +736,14 @@ define amdgpu_kernel void @s_test_sdiv23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv23_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: 
s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -766,14 +766,14 @@ ; ; GCN-IR-LABEL: s_test_sdiv23_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 41 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s0, s4 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 41 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -803,14 +803,14 @@ define amdgpu_kernel void @s_test_sdiv25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv25_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s0, s4 -; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-NEXT: s_ashr_i64 s[8:9], s[8:9], 39 ; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4 @@ -833,14 +833,14 @@ ; ; GCN-IR-LABEL: s_test_sdiv25_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s9, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s0, s4 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 39 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[8:9], 
39 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_mov_b32 s0, s4 ; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[6:7], 39 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x @@ -14,7 +14,7 @@ } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x @@ -27,7 +27,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x @@ -40,7 +40,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], {{[^,]*}}, 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void 
@global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -96,8 +96,8 @@ define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_valu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xc ; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: v_cmp_lg_f32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -478,30 +478,31 @@ define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem23_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 41 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s0, s6, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: 
v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -510,30 +511,31 @@ ; ; GCN-IR-LABEL: s_test_srem23_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 41 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 41 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 41 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s0, s6, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: s_mov_b32 s0, 
s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -549,30 +551,31 @@ define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem24_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s0, s6, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -581,30 +584,31 @@ ; ; GCN-IR-LABEL: s_test_srem24_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; 
GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 40 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 40 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s0, s6, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -674,30 +678,31 @@ define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem25_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 39 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-NEXT: 
v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s0, s6, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -706,30 +711,31 @@ ; ; GCN-IR-LABEL: s_test_srem25_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 39 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 39 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 39 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s0, s6, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: 
v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -745,30 +751,31 @@ define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem31_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_ashr_i64 s[8:9], s[4:5], 33 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 -; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-NEXT: s_xor_b32 s1, s6, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_or_b32 s1, s1, 1 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_xor_b32 s0, s6, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s0, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_mov_b32_e32 v3, s0 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: 
s_mov_b32 s1, s5 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -777,30 +784,31 @@ ; ; GCN-IR-LABEL: s_test_srem31_64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s1, s[0:1], 0xe +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 33 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 +; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 33 -; GCN-IR-NEXT: s_ashr_i64 s[0:1], s[0:1], 33 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s6 -; GCN-IR-NEXT: s_xor_b32 s1, s6, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_xor_b32 s0, s6, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -817,28 +825,28 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem32_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; 
GCN-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-NEXT: s_xor_b32 s1, s7, s0 -; GCN-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-NEXT: s_or_b32 s1, s1, 1 +; GCN-NEXT: s_xor_b32 s0, s7, s8 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_or_b32 s0, s0, 1 +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -846,28 +854,28 @@ ; ; GCN-IR-LABEL: s_test_srem32_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s8, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 -; GCN-IR-NEXT: s_xor_b32 s1, s7, s0 -; GCN-IR-NEXT: s_ashr_i32 s1, s1, 30 -; GCN-IR-NEXT: s_or_b32 s1, s1, 1 +; GCN-IR-NEXT: s_xor_b32 s0, s7, s8 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v3, s1 -; GCN-IR-NEXT: s_mov_b32 s1, s5 +; GCN-IR-NEXT: s_or_b32 s0, s0, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s0 +; GCN-IR-NEXT: 
s_mov_b32 s0, s4 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| ; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s8 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s7, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -86,9 +86,9 @@ ; SI-LABEL: truncate_high_elt_extract_vector: ; SI: ; %bb.0: ; %bb ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s4, s[4:5], 0x0 ; SI-NEXT: s_load_dword s5, s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -434,18 +434,18 @@ define amdgpu_kernel void @s_test_udiv24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv24_64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshr_b32 s0, s7, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; 
GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -459,18 +459,18 @@ ; ; GCN-IR-LABEL: s_test_udiv24_64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 8 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 8 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -534,16 +534,15 @@ ; GCN-LABEL: s_test_udiv32_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s4 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -551,22 +550,21 @@ ; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; 
GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_udiv32_i64: ; GCN-IR: ; %bb.0: ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: s_mov_b32 s0, s4 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -574,7 +572,7 @@ ; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i64 %x, 32 %2 = lshr i64 %y, 32 @@ -586,18 +584,18 @@ define amdgpu_kernel void @s_test_udiv31_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv31_i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 s2, s2, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshr_b32 s0, s7, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 
v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -611,18 +609,18 @@ ; ; GCN-IR-LABEL: s_test_udiv31_i64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_lshr_b32 s0, s7, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -643,18 +641,18 @@ define amdgpu_kernel void @s_test_udiv23_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_udiv23_i64: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s1, s5 -; GCN-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_lshr_b32 s0, s7, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: 
v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -668,18 +666,18 @@ ; ; GCN-IR-LABEL: s_test_udiv23_i64: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s0, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s1, s5 -; GCN-IR-NEXT: s_lshr_b32 s0, s0, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-IR-NEXT: s_lshr_b32 s2, s2, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_lshr_b32 s0, s7, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_mov_b32 s0, s4 +; GCN-IR-NEXT: s_mov_b32 s1, s5 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -508,86 +508,86 @@ define amdgpu_kernel void @s_test_urem31_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem31_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 
-; GCN-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: s_lshr_b32 s6, s7, 1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v5 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 
v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_and_b32_e32 v2, s4, v2 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem31_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-IR-NEXT: s_lshr_b32 s6, s7, 1 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v3 +; 
GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v5 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: s_brev_b32 s0, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-IR-NEXT: s_brev_b32 s4, -2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, s4, v2 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, @@ -658,86 +658,86 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { ; GCN-LABEL: s_test_urem23_64_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: 
s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s1 -; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v2 +; GCN-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-NEXT: s_lshr_b32 s6, s7, 9 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GCN-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GCN-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v5 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: s_brev_b32 s0, -2 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, 
s5, v0 +; GCN-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-NEXT: s_brev_b32 s4, -2 +; GCN-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-NEXT: v_and_b32_e32 v2, s4, v2 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem23_64_v2i64: ; GCN-IR: ; %bb.0: -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd -; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x11 +; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_lshr_b32 s2, s9, 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s1, 1 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: s_lshr_b32 s3, s3, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GCN-IR-NEXT: s_lshr_b32 s1, s11, 9 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v3, s1 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 -; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v2 +; GCN-IR-NEXT: s_lshr_b32 s4, s9, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_lshr_b32 s5, s5, 1 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GCN-IR-NEXT: s_lshr_b32 s6, s7, 9 +; GCN-IR-NEXT: 
v_rcp_iflag_f32_e32 v3, v0 +; GCN-IR-NEXT: s_lshr_b32 s7, s11, 9 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v5, s7 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v4, s6 +; GCN-IR-NEXT: v_mul_f32_e32 v3, v2, v3 +; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 +; GCN-IR-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v5 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v4, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cvt_u32_f32_e32 v5, v2 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: v_mad_f32 v2, -v2, v4, v3 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 -; GCN-IR-NEXT: s_brev_b32 s0, -2 -; GCN-IR-NEXT: v_and_b32_e32 v0, s0, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GCN-IR-NEXT: v_and_b32_e32 v2, s0, v2 +; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 +; GCN-IR-NEXT: v_mad_f32 v2, -v2, v5, v4 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, s7 +; GCN-IR-NEXT: s_brev_b32 s4, -2 +; GCN-IR-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v1 -; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, s4, v2 +; GCN-IR-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr <2 x i64> %x, %2 = lshr <2 x i64> %y, diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -147,11 +147,11 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, 
i32 %index) { %tmp17 = shl i32 %index, 5 -; GFX9: buffer_load_dwordx4 +; GFX9-DAG: buffer_load_dwordx4 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 -; GFX9: buffer_load_dwordx2 +; GFX9-DAG: buffer_load_dwordx2 %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)