diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -423,7 +423,7 @@ const MachineFunction &MF = *MI1.getParent()->getParent(); const DataLayout &DL = MF.getFunction().getParent()->getDataLayout(); Base1 = GetUnderlyingObject(Base1, DL); - Base2 = GetUnderlyingObject(Base1, DL); + Base2 = GetUnderlyingObject(Base2, DL); if (isa(Base1) || isa(Base2)) return false; diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -105,7 +105,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { @@ -125,7 +125,7 @@ ; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll @@ -284,7 +284,7 @@ ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] ; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] -; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -285,7 +285,7 @@ ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] ; VI: flat_load_ushort [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] ; VI: flat_load_ushort [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] -; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]] ; GCN: buffer_store_short [[RESULT]], ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -62,8 +62,8 @@ ; GCN-LABEL: {{^}}fadd_v2f16: ; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; VI: flat_load_dword v[[B_V2_F16:[0-9]+]] ; VI: flat_load_dword v[[A_V2_F16:[0-9]+]] +; VI: flat_load_dword v[[B_V2_F16:[0-9]+]] ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -83,8 +83,8 @@ ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 ; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -20,15 +20,16 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -88,14 +89,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -105,14 +106,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -154,17 +155,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 -; GFX7-NEXT: s_and_b32 s5, s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 +; GFX7-NEXT: s_and_b32 s5, s5, s8 ; GFX7-NEXT: v_mul_u32_u24_e32 v0, s5, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s6, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -250,16 +252,16 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -318,18 +320,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -341,18 +343,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -362,14 +364,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -379,14 +381,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -446,18 +448,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -469,18 +471,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -492,18 +494,18 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -516,16 +518,16 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -564,15 +566,16 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -632,14 +635,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -649,14 +652,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -716,18 +719,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 0xffff +; GFX8-NEXT: s_and_b32 s6, s3, 0xffff ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -739,18 +742,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -762,18 +765,18 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -786,16 +789,16 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -831,13 +834,13 @@ ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s5, s6, 16 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: s_and_b32 s4, s4, 0xffff -; GFX7-NEXT: s_lshr_b32 s5, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, s5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, s4, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -848,17 +851,17 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s1, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX8-NEXT: s_lshr_b32 s0, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_and_b32 s1, s4, 0xffff ; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -867,17 +870,17 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -886,17 +889,17 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 +; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -906,16 +909,16 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -954,17 +957,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s4, s8 +; GFX7-NEXT: s_and_b32 s7, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 -; GFX7-NEXT: s_and_b32 s8, s5, s8 ; GFX7-NEXT: s_lshr_b32 s5, s5, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1022,14 +1026,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1039,14 +1043,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1085,17 +1089,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x1 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s4, s8 +; GFX7-NEXT: s_and_b32 s7, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX7-NEXT: s_lshr_b32 s4, s4, 16 -; GFX7-NEXT: s_and_b32 s8, s5, s8 ; GFX7-NEXT: s_lshr_b32 s5, s5, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1153,14 +1158,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1170,14 +1175,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1216,15 +1221,16 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_and_b32 s5, s5, s8 ; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s6, s6, s8 ; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 -; GFX7-NEXT: s_and_b32 s6, s6, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1312,16 +1318,16 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX10-DL-NEXT: s_and_b32 s3, s4, s8 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s8 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1360,15 +1366,16 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_and_b32 s5, s5, s8 -; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: s_lshr_b32 s4, s4, 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: s_lshr_b32 s6, s6, 16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1456,16 +1463,16 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1504,17 +1511,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_and_b32 s5, s5, s8 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1600,16 +1608,16 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: s_and_b32 s7, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1648,15 +1656,16 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1748,17 +1757,17 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1821,19 +1830,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1845,19 +1854,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1869,19 +1878,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1894,17 +1903,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1945,17 +1954,18 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s7 -; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s7, v2, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2045,17 +2055,17 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2119,19 +2129,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2143,19 +2153,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2167,19 +2177,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2192,17 +2202,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i16 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2244,16 +2254,17 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_lshr_b32 s9, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v1, s9, v0, v1 +; GFX7-NEXT: s_lshr_b32 s6, s4, 16 +; GFX7-NEXT: s_lshr_b32 s7, s5, 16 ; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 ; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v1, s7, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2344,17 +2355,17 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2418,19 +2429,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2442,19 +2453,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2466,19 +2477,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2491,17 +2502,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2561,23 +2572,23 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s2 +; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s0, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2585,23 +2596,23 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2610,15 +2621,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2699,19 +2710,19 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2726,20 +2737,20 @@ ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2754,20 +2765,20 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -16,22 +16,22 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s4 -; GFX7-NEXT: s_sext_i32_i8 s8, s5 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010 -; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0 +; GFX7-NEXT: s_sext_i32_i8 s6, s4 +; GFX7-NEXT: s_sext_i32_i8 s7, s5 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80010 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x80010 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24 -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 @@ -45,27 +45,27 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s4, s2 +; GFX8-NEXT: s_sext_i32_i8 s5, s3 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -76,27 +76,27 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -105,15 +105,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -123,14 +123,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s2, s3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -220,29 +220,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 +; GFX8-NEXT: s_sext_i32_i8 s1, s2 +; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s1, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -251,29 +251,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 +; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -282,15 +282,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -356,28 +356,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_and_b32 s6, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_and_b32 s7, s6, s5 +; GFX7-NEXT: s_and_b32 s5, s4, s5 +; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -386,31 +386,31 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX8-NEXT: s_and_b32 s3, s1, s2 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s0, s2, s0 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s2, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -418,31 +418,31 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -451,15 +451,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -520,23 +520,23 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s4 -; GFX7-NEXT: s_sext_i32_i8 s8, s5 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 -; GFX7-NEXT: v_mad_i32_i24 v1, s7, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010 -; GFX7-NEXT: v_mad_i32_i24 v1, s9, v2, v1 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010 -; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1 +; GFX7-NEXT: s_sext_i32_i8 s6, s4 +; GFX7-NEXT: s_sext_i32_i8 s7, s5 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 +; GFX7-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80010 +; GFX7-NEXT: v_mad_i32_i24 v1, s8, v2, v1 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x80010 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: s_ashr_i32 s5, s5, 24 -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX7-NEXT: s_ashr_i32 s4, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 @@ -550,28 +550,28 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s4, s2 +; GFX8-NEXT: s_sext_i32_i8 s5, s3 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -582,28 +582,28 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -614,28 +614,28 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s3 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80008 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v1 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -648,23 +648,23 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -719,25 +719,25 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i32 s7, s4, 24 -; GFX7-NEXT: s_ashr_i32 s10, s5, 24 -; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80010 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80008 +; GFX7-NEXT: s_ashr_i32 s6, s4, 24 +; GFX7-NEXT: s_ashr_i32 s9, s5, 24 +; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008 ; GFX7-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80010 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 ; GFX7-NEXT: s_sext_i32_i8 s4, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s10 ; GFX7-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -748,28 +748,28 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX8-NEXT: s_ashr_i32 s5, s3, 24 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NEXT: s_ashr_i32 s6, s3, 24 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s2, 24 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX8-NEXT: s_ashr_i32 s4, s2, 24 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -780,28 +780,28 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-NODL-NEXT: s_ashr_i32 s6, s3, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s3, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s4, s2, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -812,28 +812,28 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-DL-NEXT: s_ashr_i32 s6, s3, 24 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-DL-NEXT: s_ashr_i32 s4, s2, 24 +; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -846,24 +846,24 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -927,33 +927,33 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80000 -; GFX8-NEXT: s_lshr_b32 s4, s2, 16 -; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s1 -; GFX8-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s3 -; GFX8-NEXT: s_and_b32 s3, s0, s6 -; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2 -; GFX8-NEXT: s_bfe_i32 s2, s4, 0x80000 +; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000 +; GFX8-NEXT: s_lshr_b32 s4, s3, 16 +; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3 +; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000 +; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0 +; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000 +; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s6 ; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4 -; GFX8-NEXT: s_and_b32 s4, s0, s5 -; GFX8-NEXT: v_mov_b32_e32 v7, s3 -; GFX8-NEXT: s_and_b32 s2, s0, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s4, s2, s5 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-NEXT: s_and_b32 s3, s2, s3 +; GFX8-NEXT: s_and_b32 s0, s2, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1046,31 +1046,31 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v2 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s3 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s4 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 8, s2 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s5 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5 +; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -17,22 +17,22 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_and_b32 s8, s5, s8 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: s_and_b32 s6, s4, s8 +; GFX7-NEXT: s_and_b32 s7, s5, s8 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: s_lshr_b32 s5, s5, 24 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 @@ -47,22 +47,22 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 +; GFX8-NEXT: s_and_b32 s5, s3, s2 ; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_lshr_b32 s4, s4, 24 -; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s3, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 @@ -79,22 +79,22 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 @@ -108,15 +108,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -126,14 +126,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s2, s3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -183,28 +183,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_and_b32 s6, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_and_b32 s7, s6, s5 +; GFX7-NEXT: s_and_b32 s5, s4, s5 +; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -214,20 +214,20 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_and_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 @@ -246,20 +246,20 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 ; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 @@ -278,15 +278,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -353,28 +353,28 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 ; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_and_b32 s6, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_and_b32 s7, s6, s5 +; GFX7-NEXT: s_and_b32 s5, s4, s5 +; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -383,31 +383,31 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX8-NEXT: s_and_b32 s3, s1, s2 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: s_and_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s0, s2, s0 ; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s2, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s0, s0, 24 +; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -415,31 +415,31 @@ ; GFX9-NODL: ; %bb.0: ; %entry ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 +; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 ; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -448,15 +448,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -613,19 +613,19 @@ ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -752,14 +752,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -948,24 +948,24 @@ ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1012,23 +1012,23 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_and_b32 s8, s5, s8 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: v_mad_u32_u24 v1, s7, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 -; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 +; GFX7-NEXT: s_and_b32 s6, s4, s8 +; GFX7-NEXT: s_and_b32 s7, s5, s8 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: v_mad_u32_u24 v1, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: s_lshr_b32 s5, s5, 24 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 @@ -1043,23 +1043,23 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 +; GFX8-NEXT: s_and_b32 s5, s3, s2 ; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX8-NEXT: v_mad_u32_u24 v1, s6, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_lshr_b32 s4, s4, 24 -; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: s_lshr_b32 s3, s3, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 @@ -1076,23 +1076,23 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v0, v1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 @@ -1109,23 +1109,23 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 -; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0 @@ -1144,23 +1144,23 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1215,23 +1215,23 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s12, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_and_b32 s8, s5, s8 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1 -; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-NEXT: s_and_b32 s6, s4, s8 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s7, s5, s8 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v0, s9 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: s_lshr_b32 s5, s5, 24 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v2, v0 @@ -1247,23 +1247,23 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s3, s2 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX8-NEXT: s_and_b32 s5, s3, s2 +; GFX8-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mad_u32_u24 v0, s7, v0, v1 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s5, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, s6, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s10, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: s_lshr_b32 s4, s4, 24 -; GFX8-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX8-NEXT: s_lshr_b32 s3, s3, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mad_u32_u24 v0, s3, v2, v0 @@ -1281,23 +1281,23 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX9-NODL-NEXT: s_and_b32 s5, s3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v0, v1 -; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v0 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v2, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, s10, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 @@ -1315,23 +1315,23 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s10, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80008 +; GFX9-DL-NEXT: s_and_b32 s5, s3, s2 +; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v0, v1 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010 -; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v0 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: v_add_u32_e32 v1, s10, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0 @@ -1351,24 +1351,24 @@ ; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 +; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_and_b32 s8, s3, s2 ; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s6, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0 ; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 ; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1421,28 +1421,28 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s6, s4 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s7, s5 -; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s7, s6 +; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s5, s4 ; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX7-NEXT: s_and_b32 s6, s6, s8 +; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010 +; GFX7-NEXT: s_and_b32 s5, s5, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s6, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: s_lshr_b32 s4, s4, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1452,29 +1452,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s3, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_sext_i32_i8 s4, s3 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_sext_i32_i8 s4, s0 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1483,29 +1483,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1514,29 +1514,29 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1546,24 +1546,24 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1613,31 +1613,32 @@ ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_movk_i32 s12, 0xff +; GFX7-NEXT: s_movk_i32 s11, 0xff ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s7, s4, 24 -; GFX7-NEXT: s_lshr_b32 s9, s5, 24 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_bfe_u32 s13, s5, 0x80010 -; GFX7-NEXT: s_and_b32 s5, s5, s12 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s12 +; GFX7-NEXT: s_lshr_b32 s6, s4, 24 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 +; GFX7-NEXT: s_lshr_b32 s8, s5, 24 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 +; GFX7-NEXT: s_and_b32 s5, s5, s11 +; GFX7-NEXT: s_and_b32 s4, s4, s11 +; GFX7-NEXT: s_load_dword s11, s[0:1], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s11 ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1649,24 +1650,25 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s6, s3, 24 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX8-NEXT: s_lshr_b32 s5, s3, 24 +; GFX8-NEXT: s_lshr_b32 s6, s4, 24 +; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80010 ; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3 ; GFX8-NEXT: s_and_b32 s3, s3, s2 ; GFX8-NEXT: s_and_b32 s2, s4, s2 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80010 ; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: v_mad_u32_u24 v2, s3, v2, v3 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_lshr_b32 s7, s4, 24 -; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1680,24 +1682,25 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80010 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 ; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2 ; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80010 ; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 24 -; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off @@ -1711,24 +1714,25 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24 -; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 24 +; GFX9-DL-NEXT: s_lshr_b32 s6, s4, 24 +; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3 ; GFX9-DL-NEXT: s_and_b32 s3, s3, s2 ; GFX9-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80010 ; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4 +; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3 ; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 24 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1745,24 +1749,24 @@ ; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_and_b32 s7, s4, s3 +; GFX10-DL-NEXT: s_and_b32 s3, s5, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-DL-NEXT: s_and_b32 s0, s4, s3 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v4, s0, s1, v4 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1798,29 +1802,29 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, -1 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_movk_i32 s7, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 24 -; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010 -; GFX7-NEXT: s_lshr_b32 s9, s5, 24 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010 +; GFX7-NEXT: s_lshr_b32 s9, s6, 24 +; GFX7-NEXT: s_and_b32 s6, s6, s7 +; GFX7-NEXT: s_lshr_b32 s5, s4, 24 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 ; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_and_b32 s4, s4, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s6 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1934,27 +1938,27 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s3 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s3, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s2, 16, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 @@ -2081,25 +2085,25 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 ; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 @@ -2117,25 +2121,25 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16 +; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 @@ -2154,24 +2158,24 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -16,38 +16,38 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v0, s9 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1 -; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 +; GFX7-NEXT: v_mad_i32_i24 v0, s13, v1, v0 +; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s19, v1, v0 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0 @@ -61,43 +61,43 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s11, v1, v0 +; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s13, v1, v0 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s17, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -108,43 +108,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0 +; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s11, v1, v0 +; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s13, v1, v0 +; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s17, v1, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -153,15 +153,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -171,14 +171,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -321,49 +321,49 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s7, s4, 12 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX8-NEXT: s_lshr_b32 s1, s0, 12 +; GFX8-NEXT: s_lshr_b32 s7, s2, 12 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s14 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s5 +; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -372,49 +372,49 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: s_lshr_b32 s1, s0, 12 +; GFX9-NEXT: s_lshr_b32 s7, s2, 12 +; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -423,49 +423,49 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 +; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -476,44 +476,44 @@ ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -656,21 +656,21 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s4, s0, 12 -; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000 -; GFX8-NEXT: s_lshr_b32 s5, s1, 12 -; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008 -; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s6, 12 +; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX8-NEXT: s_lshr_b32 s1, s0, 12 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 ; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 ; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 @@ -678,28 +678,28 @@ ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 ; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 ; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s15 ; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: s_ashr_i32 s6, s6, 28 ; GFX8-NEXT: v_mov_b32_e32 v10, s17 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -710,21 +710,21 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s4, s0, 12 -; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000 -; GFX9-NEXT: s_lshr_b32 s5, s1, 12 -; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s6, 12 +; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-NEXT: s_lshr_b32 s1, s0, 12 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 ; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 ; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 @@ -732,28 +732,28 @@ ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 ; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 ; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s15 ; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: s_ashr_i32 s6, s6, 28 ; GFX9-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -764,21 +764,21 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 +; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 ; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 @@ -786,28 +786,28 @@ ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 ; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 ; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 ; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -819,44 +819,44 @@ ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -939,39 +939,39 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000 -; GFX7-NEXT: v_mov_b32_e32 v0, s9 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1 -; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0 -; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v2, s13 -; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s12, v2, v0 -; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0 -; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0 -; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 +; GFX7-NEXT: v_mad_i32_i24 v1, s2, v0, v1 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mad_i32_i24 v0, s9, v2, v0 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s11, v2, v0 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 +; GFX7-NEXT: v_mad_i32_i24 v0, s13, v2, v0 +; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v2, s16 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s15, v2, v0 +; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s20, v2, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s19, v2, v0 ; GFX7-NEXT: s_ashr_i32 s0, s0, 28 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: v_mad_i32_i24 v0, s0, v2, v0 @@ -986,45 +986,45 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v2, v0 +; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s11, v2, v0 +; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s13, v2, v0 +; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s15, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s17, v2, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1035,45 +1035,45 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s9, v2, v0 +; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s11, v2, v0 +; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s13, v2, v0 +; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s15, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s17, v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1084,45 +1084,45 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s7, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s11, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s13, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-DL-NEXT: s_bfe_i32 s17, s2, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s15, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s17, v2, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1135,36 +1135,36 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s8, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1248,17 +1248,15 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 ; GFX7-NEXT: s_lshl_b32 s11, s1, 4 -; GFX7-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 12 ; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60 ; GFX7-NEXT: s_lshl_b32 s11, s1, 16 ; GFX7-NEXT: s_ashr_i64 s[18:19], s[10:11], 60 ; GFX7-NEXT: s_lshl_b32 s11, s1, 20 ; GFX7-NEXT: s_lshl_b32 s13, s1, 8 +; GFX7-NEXT: s_lshl_b32 s15, s1, 12 ; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60 ; GFX7-NEXT: s_lshl_b32 s11, s1, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 28 @@ -1278,10 +1276,14 @@ ; GFX7-NEXT: s_lshl_b32 s1, s9, 28 ; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 ; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 +; GFX7-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 +; GFX7-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s36 ; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s34 @@ -1289,12 +1291,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s32 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s30 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s24 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1308,56 +1309,56 @@ ; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 20 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 24 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 28 -; GFX8-NEXT: s_lshl_b32 s9, s5, 8 -; GFX8-NEXT: s_lshl_b32 s11, s5, 12 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s32 -; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s28 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 20 +; GFX8-NEXT: s_lshl_b32 s11, s5, 8 +; GFX8-NEXT: s_lshl_b32 s13, s5, 12 +; GFX8-NEXT: s_lshl_b32 s15, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s5, 28 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 16 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 +; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s34 +; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s32 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s28 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s26 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s24 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s20 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s22 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1369,56 +1370,56 @@ ; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-NEXT: s_lshl_b32 s15, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s24 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1430,56 +1431,56 @@ ; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-DL-NEXT: s_lshl_b32 s15, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1492,48 +1493,48 @@ ; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s13, s7, 24 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 +; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 8 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[8:9], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s14, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[10:11], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[6:7], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s10, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s4, v2 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1579,31 +1580,31 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s0, s[10:11], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i32 s8, s0, 28 -; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40018 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40008 -; GFX7-NEXT: s_ashr_i32 s15, s1, 28 -; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40014 -; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 +; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40010 +; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s21, s0, 0x40008 +; GFX7-NEXT: s_ashr_i32 s15, s0, 28 +; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c +; GFX7-NEXT: s_ashr_i32 s8, s1, 28 +; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40014 +; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40010 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40000 ; GFX7-NEXT: v_mov_b32_e32 v4, s19 -; GFX7-NEXT: s_bfe_i32 s13, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40004 ; GFX7-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-NEXT: s_bfe_i32 s14, s0, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s1, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s0, v1 +; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s1, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4 @@ -1636,68 +1637,68 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s2, 8 -; GFX8-NEXT: s_lshr_b32 s5, s4, 4 -; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: s_lshr_b32 s15, s2, 4 +; GFX8-NEXT: s_lshr_b32 s16, s2, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX8-NEXT: s_lshr_b32 s8, s0, 4 +; GFX8-NEXT: s_lshr_b32 s9, s0, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 12 -; GFX8-NEXT: s_lshr_b32 s1, s4, 12 +; GFX8-NEXT: s_lshr_b32 s7, s0, 12 +; GFX8-NEXT: s_lshr_b32 s14, s2, 12 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: s_lshr_b32 s1, s4, 20 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s1, s4, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s13, s2, 16 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX8-NEXT: s_lshr_b32 s5, s0, 20 +; GFX8-NEXT: s_lshr_b32 s12, s2, 20 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX8-NEXT: s_lshr_b32 s1, s0, 28 +; GFX8-NEXT: s_lshr_b32 s10, s2, 28 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1707,64 +1708,64 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s4, s2, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s12, s2, 28 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s6, 15 +; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s5, s4, 15 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1774,64 +1775,64 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s4, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 +; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1841,65 +1842,65 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2289,80 +2290,80 @@ ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 4 -; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s4 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s15 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s16 +; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s0 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v7 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 20 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 20 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s0 -; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s1 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s9 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 +; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 20 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s11 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s12 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s1 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v19, v10 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v15, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v9, v7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v11, v12 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v6, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v8 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -15,42 +15,42 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s8, s0, 28 -; GFX7-NEXT: s_lshr_b32 s15, s1, 28 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 +; GFX7-NEXT: s_lshr_b32 s11, s10, 28 +; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 +; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c ; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 ; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 ; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -60,44 +60,44 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 +; GFX8-NEXT: s_lshr_b32 s7, s6, 28 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -107,44 +107,44 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_lshr_b32 s7, s6, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -153,15 +153,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -171,14 +171,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -304,46 +304,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -352,46 +352,46 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -400,46 +400,46 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -449,38 +449,38 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -606,46 +606,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -654,46 +654,46 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -702,46 +702,46 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -751,38 +751,38 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -909,48 +909,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_lshr_b32 s11, s2, 28 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -960,48 +960,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1011,48 +1011,48 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -1063,40 +1063,40 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1208,48 +1208,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_lshr_b32 s11, s2, 28 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1259,48 +1259,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1310,48 +1310,48 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -1362,40 +1362,40 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s0, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1460,43 +1460,43 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s8, s0, 28 -; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s15, s1, 28 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 +; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 +; GFX7-NEXT: s_lshr_b32 s11, s10, 28 +; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 +; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c ; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 ; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 ; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v1, s21 ; GFX7-NEXT: v_mad_u32_u24 v1, s0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, s14, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 -; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-NEXT: v_mad_u32_u24 v1, s10, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: v_mad_u32_u24 v1, s2, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: v_mad_u32_u24 v1, s1, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -1507,46 +1507,46 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 +; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_lshr_b32 s7, s6, 28 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1556,46 +1556,46 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_lshr_b32 s7, s6, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1605,46 +1605,46 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s4, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1657,36 +1657,36 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s7, s8, v3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1768,42 +1768,42 @@ ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s8, s0, 28 -; GFX7-NEXT: s_lshr_b32 s15, s1, 28 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s21, s1, 0x40004 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40010 +; GFX7-NEXT: s_lshr_b32 s1, s0, 28 +; GFX7-NEXT: s_lshr_b32 s11, s10, 28 +; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 +; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 +; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 +; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c +; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 +; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 +; GFX7-NEXT: s_and_b32 s10, s10, 15 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c ; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 ; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 ; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s1 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, s10 ; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1813,44 +1813,44 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s4, s2, 28 +; GFX8-NEXT: s_lshr_b32 s7, s6, 28 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_and_b32 s6, s6, 15 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1860,44 +1860,44 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_lshr_b32 s7, s6, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_and_b32 s6, s6, 15 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1906,15 +1906,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1924,14 +1924,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2032,46 +2032,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s9 +; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2080,53 +2080,53 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-NEXT: s_and_b32 s4, s0, 15 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s12, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s2 ; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-NEXT: v_pk_mul_lo_u16 v6, s1, v6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2135,53 +2135,53 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 +; GFX9-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s12, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s2 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s7 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s1, v6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v5 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2191,42 +2191,42 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s7, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 @@ -2553,59 +2553,59 @@ ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 -; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s6, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s8, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s5 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 +; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s7, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s0, s4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v11, v5, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2696,48 +2696,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s1, s2, 15 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX8-NEXT: s_and_b32 s1, s0, 15 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: s_lshr_b32 s11, s2, 28 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s2 +; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2747,48 +2747,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s1, s2, 15 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-NEXT: s_and_b32 s1, s0, 15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -2798,48 +2798,48 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 +; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm @@ -2850,40 +2850,40 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -2927,41 +2927,41 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, 15 -; GFX7-NEXT: s_and_b32 s8, s5, 15 -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40004 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s4, 0x4000c -; GFX7-NEXT: s_bfe_u32 s15, s4, 0x40010 -; GFX7-NEXT: s_bfe_u32 s17, s4, 0x40014 -; GFX7-NEXT: s_bfe_u32 s19, s4, 0x40018 +; GFX7-NEXT: s_and_b32 s6, s4, 15 +; GFX7-NEXT: s_and_b32 s7, s5, 15 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s16, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s18, s4, 0x40018 ; GFX7-NEXT: s_lshr_b32 s4, s4, 28 -; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x40004 -; GFX7-NEXT: s_bfe_u32 s12, s5, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s5, 0x4000c -; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40014 -; GFX7-NEXT: s_bfe_u32 s20, s5, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1 +; GFX7-NEXT: s_bfe_u32 s9, s5, 0x40004 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40018 ; GFX7-NEXT: s_lshr_b32 s5, s5, 28 ; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_u32_u24 v0, s16, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s18, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s19 -; GFX7-NEXT: v_mad_u32_u24 v0, s20, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s10 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s12 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s15, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_mad_u32_u24 v0, s17, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s18 +; GFX7-NEXT: v_mad_u32_u24 v0, s19, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2972,43 +2972,43 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s3, 15 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX8-NEXT: s_and_b32 s4, s2, 15 +; GFX8-NEXT: s_and_b32 s5, s3, 15 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX8-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX8-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_u32 s7, s3, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s3, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s3, 0x4000c +; GFX8-NEXT: s_bfe_u32 s13, s3, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s3, 0x40014 +; GFX8-NEXT: s_bfe_u32 s17, s3, 0x40018 ; GFX8-NEXT: s_lshr_b32 s3, s3, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s15, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v2, s17, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3019,43 +3019,43 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX9-NEXT: s_and_b32 s4, s2, 15 +; GFX9-NEXT: s_and_b32 s5, s3, 15 +; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s5, v0, v1 +; GFX9-NEXT: s_bfe_u32 s7, s3, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s3, 0x40018 ; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mad_u32_u24 v0, s13, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s15, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v2, s17, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -3065,14 +3065,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -3082,14 +3082,14 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -562,31 +562,32 @@ ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v1, v1, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_max_f32_e32 v2, v2, v5 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -615,31 +615,32 @@ ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 -; SI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 -; SI-NEXT: s_lshr_b32 s4, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s5 +; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 +; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v1, v1, v5 ; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_min_f32_e32 v2, v2, v5 ; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -10,8 +10,8 @@ ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] @@ -101,8 +101,8 @@ ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -16,13 +16,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -38,13 +38,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 +; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -73,15 +73,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v3, v2 -; VI-NEXT: v_max_i16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_max_i16_e32 v3, v5, v2 +; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -97,13 +97,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v3, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -124,35 +124,35 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v8 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; VI-NEXT: flat_load_dword v9, v[0:1] -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[2:3] -; VI-NEXT: flat_load_ushort v6, v[6:7] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v8 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; VI-NEXT: flat_load_dword v8, v[0:1] +; VI-NEXT: flat_load_ushort v9, v[4:5] +; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v7, v5, v9 -; VI-NEXT: v_max_i16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v1, v8, v2 +; VI-NEXT: v_max_i16_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v6, v4 -; VI-NEXT: v_or_b32_e32 v5, v7, v5 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: v_max_i16_e32 v0, v9, v0 +; VI-NEXT: flat_store_dword v[6:7], v1 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: @@ -167,19 +167,20 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v6, v[2:3], off -; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v8, v6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_pk_max_i16 v7, v7, v6 -; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 +; GFX9-NEXT: v_pk_max_i16 v6, v6, v7 +; GFX9-NEXT: global_load_short_d16 v7, v[2:3], off offset:4 ; GFX9-NEXT: global_load_short_d16 v8, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v8, v6 -; GFX9-NEXT: global_store_dword v[4:5], v7, off +; GFX9-NEXT: v_pk_max_i16 v0, v8, v7 +; GFX9-NEXT: global_store_dword v[4:5], v6, off ; GFX9-NEXT: global_store_short v[4:5], v0, off offset:4 ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -208,8 +209,8 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc @@ -235,8 +236,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc @@ -271,13 +272,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 +; VI-NEXT: v_max_i16_e32 v2, v5, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -293,13 +294,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 +; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -328,13 +329,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -350,13 +351,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -385,13 +386,13 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_ushort v5, v[0:1] ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 +; VI-NEXT: v_max_u16_e32 v2, v5, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -407,13 +408,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ushort v5, v[0:1], off ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 +; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -441,15 +442,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v5, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v4, v3, v2 -; VI-NEXT: v_max_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 +; VI-NEXT: v_max_u16_e32 v3, v5, v2 +; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -465,13 +466,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v5, v[0:1], off ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v2, v3, v2 +; GFX9-NEXT: v_pk_max_u16 v2, v5, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -393,11 +393,11 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -73,7 +73,7 @@ ; GCN-LABEL: {{^}}mul_v2i16: ; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}} ; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}} -; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]] +; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]] ; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]] ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]] ; NOSDWA-NOT: v_mul_u32_u24_sdwa diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -78,7 +78,7 @@ ; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}} ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]] ; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]] -; GCN: s_mov_b32 m0, -1 +; GCN-DAG: s_mov_b32 m0, -1 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256 ; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256 diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -107,7 +107,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { @@ -127,7 +127,7 @@ ; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64: ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] -; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -66,39 +66,39 @@ ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s4, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s6, s[6:7], 0x0 -; VI-NEXT: s_load_dword s7, s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_load_dword s4, s[6:7], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s6, 16 -; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: s_lshr_b32 s6, s4, 16 +; VI-NEXT: s_lshr_b32 s7, s5, 16 ; VI-NEXT: s_sub_i32 s4, s4, s5 -; VI-NEXT: s_sub_i32 s6, s6, s7 -; VI-NEXT: s_and_b32 s5, s6, 0xffff -; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: s_sub_i32 s5, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_lshl_b32 s5, s5, 16 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -614,12 +614,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v2 +; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 @@ -642,15 +642,15 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v1, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_sub_u16_e32 v0, v0, v2 -; VI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v0, v1 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -106,13 +106,13 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[4:5], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -19,11 +19,11 @@ ; GFX9-LABEL: shuffle_v4f16_234u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -114,15 +114,15 @@ ; GFX9-LABEL: shuffle_v4f16_35u5: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -134,15 +134,15 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -181,10 +181,10 @@ ; GFX9-LABEL: shuffle_v4f16_0145: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -196,10 +196,11 @@ ; GFX9-LABEL: shuffle_v4f16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -239,11 +240,11 @@ ; GFX9-LABEL: shuffle_v4f16_2345: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -255,11 +256,10 @@ ; GFX9-LABEL: shuffle_v4f16_2367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -271,11 +271,10 @@ ; GFX9-LABEL: shuffle_v4f16_4501: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[3:4], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -287,8 +286,8 @@ ; GFX9-LABEL: shuffle_v4f16_4523: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -329,11 +328,11 @@ ; GFX9-LABEL: shuffle_v4f16_6701: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -345,8 +344,8 @@ ; GFX9-LABEL: shuffle_v4f16_6723: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -390,9 +389,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v5 @@ -409,9 +408,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -424,15 +423,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -444,15 +443,15 @@ ; GFX9-LABEL: shuffle_v4f16_5634: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -464,16 +463,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -487,9 +486,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, v5 @@ -504,10 +503,11 @@ ; GFX9-LABEL: shuffle_v4i16_0167: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x i16>, <4 x i16> addrspace(1)* %arg0 %val1 = load <4 x i16>, <4 x i16> addrspace(1)* %arg1 @@ -571,12 +571,12 @@ ; GFX9-LABEL: shuffle_v4f16_6161: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -651,10 +651,8 @@ ; GFX9-LABEL: shuffle_v8f16_4589: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: global_load_dword v1, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 @@ -667,10 +665,8 @@ ; GFX9-LABEL: shuffle_v8f16_10_11_2_3: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0 @@ -685,9 +681,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; GFX9-NEXT: global_load_dword v1, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -732,10 +728,13 @@ ; GFX9-LABEL: shuffle_v6f16_452367: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v3, v[3:4], off +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <6 x half>, <6 x half> addrspace(1)* %arg0 @@ -760,9 +759,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, s5 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1] ; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1] @@ -803,14 +802,15 @@ ; GFX9-LABEL: shuffle_v4f16_0456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -13,7 +13,7 @@ ; DEFAULT: buffer_load_format_xyzw ; DEFAULT: s_waitcnt vmcnt(0) ; DEFAULT: exp -; DEFAULT-NEXT: exp +; DEFAULT: exp ; DEFAULT-NEXT: s_endpgm define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 { main_body: