Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -451,6 +451,9 @@ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) return false; + // Scheduler sends the cluster length less one. + ++NumLoads; + const MachineOperand *FirstDst = nullptr; const MachineOperand *SecondDst = nullptr; Index: llvm/test/CodeGen/AMDGPU/bitreverse.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -116,36 +116,37 @@ define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dword v0, v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid @@ -191,38 +192,39 @@ define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfrev_b32_e32 v1, v1 ; SI-NEXT: v_bfrev_b32_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid @@ -382,22 +384,23 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s6, 0xcccccccc -; SI-NEXT: s_mov_b32 s8, 0x55555555 -; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_mov_b32 s5, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s6, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s7, 0x33333333 +; SI-NEXT: s_mov_b32 s8, 0xcccccccc +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s9, 0x55555555 +; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 ; SI-NEXT: v_alignbit_b32 v4, v1, v0, 24 @@ -410,61 +413,62 @@ ; SI-NEXT: v_and_b32_e32 v0, 0xff0000, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff000000, v5 -; SI-NEXT: v_and_b32_e32 v7, s0, v7 +; SI-NEXT: v_and_b32_e32 v7, s4, v7 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v5, v7, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v1, v4, v5 ; SI-NEXT: v_or_b32_e32 v3, v0, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v1 -; SI-NEXT: v_and_b32_e32 v2, s2, v1 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s5, v1 +; SI-NEXT: v_and_b32_e32 v2, s6, v1 +; SI-NEXT: v_and_b32_e32 v1, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, s6, v3 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s7, v3 +; SI-NEXT: v_and_b32_e32 v0, s7, v2 +; SI-NEXT: v_and_b32_e32 v3, s8, v3 +; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s8, v3 -; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v3 -; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v1, s9, v3 +; SI-NEXT: v_and_b32_e32 v0, s9, v2 +; SI-NEXT: v_and_b32_e32 v3, s10, v3 +; SI-NEXT: v_and_b32_e32 v2, s10, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: v_mov_b32_e32 v4, 8 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s5, 0xf0f0f0f0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s6, 0x55555555 -; FLAT-NEXT: s_mov_b32 s8, 0xaaaaaaaa -; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, 0x33333333 +; FLAT-NEXT: s_mov_b32 s7, 0xcccccccc +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s8, 0x55555555 +; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_lshlrev_b64 v[2:3], 24, v[0:1] ; FLAT-NEXT: v_alignbit_b32 v2, v1, v0, 24 @@ -483,32 +487,31 @@ ; FLAT-NEXT: v_or_b32_e32 v0, v4, v0 ; FLAT-NEXT: v_or_b32_sdwa v2, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; FLAT-NEXT: v_or_b32_e32 v3, v0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v1 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v1 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s4, v1 +; FLAT-NEXT: v_and_b32_e32 v2, s5, v1 +; FLAT-NEXT: v_and_b32_e32 v1, s4, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s5, v3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s6, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s6, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s7, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s7, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s6, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s6, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s8, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid @@ -778,23 +781,23 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_mov_b32 s5, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s6, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s7, 0x33333333 ; SI-NEXT: s_mov_b32 s8, 0xcccccccc ; SI-NEXT: s_mov_b32 s9, 0x55555555 ; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], 8 ; SI-NEXT: v_alignbit_b32 v6, v3, v2, 24 @@ -819,7 +822,7 @@ ; SI-NEXT: v_mov_b32_e32 v7, 0xff00 ; SI-NEXT: v_and_b32_e32 v2, v0, v11 ; SI-NEXT: v_and_b32_e32 v11, v0, v12 -; SI-NEXT: v_and_b32_e32 v9, s0, v9 +; SI-NEXT: v_and_b32_e32 v9, s4, v9 ; SI-NEXT: v_and_b32_e32 v12, 0xff000000, v13 ; SI-NEXT: v_and_b32_e32 v0, v0, v17 ; SI-NEXT: v_and_b32_e32 v13, v7, v15 @@ -828,7 +831,7 @@ ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, s0, v4 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 ; SI-NEXT: v_or_b32_e32 v7, v16, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v9, v12, v11 @@ -838,14 +841,14 @@ ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v1 ; SI-NEXT: v_or_b32_e32 v3, v2, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v6 -; SI-NEXT: v_and_b32_e32 v2, s2, v6 -; SI-NEXT: v_and_b32_e32 v4, s1, v5 -; SI-NEXT: v_and_b32_e32 v6, s2, v5 -; SI-NEXT: v_and_b32_e32 v5, s1, v7 -; SI-NEXT: v_and_b32_e32 v7, s2, v7 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s5, v6 +; SI-NEXT: v_and_b32_e32 v2, s6, v6 +; SI-NEXT: v_and_b32_e32 v4, s5, v5 +; SI-NEXT: v_and_b32_e32 v6, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, s5, v7 +; SI-NEXT: v_and_b32_e32 v7, s6, v7 +; SI-NEXT: v_and_b32_e32 v1, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, s6, v3 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 @@ -854,12 +857,12 @@ ; SI-NEXT: v_or_b32_e32 v6, v6, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v5, s3, v7 -; SI-NEXT: v_and_b32_e32 v4, s3, v6 +; SI-NEXT: v_and_b32_e32 v5, s7, v7 +; SI-NEXT: v_and_b32_e32 v4, s7, v6 ; SI-NEXT: v_and_b32_e32 v7, s8, v7 ; SI-NEXT: v_and_b32_e32 v6, s8, v6 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 +; SI-NEXT: v_and_b32_e32 v1, s7, v3 +; SI-NEXT: v_and_b32_e32 v0, s7, v2 ; SI-NEXT: v_and_b32_e32 v3, s8, v3 ; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 @@ -886,29 +889,30 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v1, v5, v8 ; SI-NEXT: v_or_b32_e32 v0, v4, v7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: v_mov_b32_e32 v8, 8 ; FLAT-NEXT: v_mov_b32_e32 v10, 0xff0000 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s4, 0xf0f0f0f ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s3 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s1, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s5, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s6, 0x33333333 +; FLAT-NEXT: s_mov_b32 s7, 0xcccccccc +; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; FLAT-NEXT: s_mov_b32 s8, 0x55555555 ; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 24, v[2:3] ; FLAT-NEXT: v_lshlrev_b32_sdwa v12, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -944,14 +948,14 @@ ; FLAT-NEXT: v_or_b32_e32 v7, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v5, v5, v12 ; FLAT-NEXT: v_or_b32_e32 v8, v6, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s0, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s2, v5 -; FLAT-NEXT: v_and_b32_e32 v6, s0, v5 -; FLAT-NEXT: v_and_b32_e32 v5, s2, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s0, v8 +; FLAT-NEXT: v_and_b32_e32 v0, s4, v3 +; FLAT-NEXT: v_and_b32_e32 v1, s4, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s5, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s5, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s4, v5 +; FLAT-NEXT: v_and_b32_e32 v6, s5, v5 +; FLAT-NEXT: v_and_b32_e32 v5, s4, v8 +; FLAT-NEXT: v_and_b32_e32 v7, s5, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -960,14 +964,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s1, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s1, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s6, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s6, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s6, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s6, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s7, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s7, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s7, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s7, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -992,7 +996,7 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v1, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v0, v6, v4 -; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -864,22 +864,22 @@ ; GCN-LABEL: {{^}}stack_8xv5i32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 8 +; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 10 +; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 11 +; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 12 +; GCN-DAG: buffer_store_dword [[REG8]], {{.*$}} +; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-DAG: v_mov_b32_e32 [[REG13:v[0-9]+]], 13 +; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 14 +; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 15 +; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 7 ; GCN: s_getpc @@ -898,22 +898,22 @@ } ; GCN-LABEL: {{^}}stack_8xv5f32: -; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 -; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 -; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 -; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 -; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 -; GCN: buffer_store_dword [[REG8]], {{.*$}} -; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4 -; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8 -; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12 -; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16 -; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 -; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 -; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 -; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20 -; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24 -; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28 +; GCN-DAG: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000 +; GCN-DAG: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000 +; GCN-DAG: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000 +; GCN-DAG: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000 +; GCN-DAG: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000 +; GCN-DAG: buffer_store_dword [[REG8]], {{.*$}} +; GCN-DAG: buffer_store_dword [[REG9]], {{.*}} offset:4 +; GCN-DAG: buffer_store_dword [[REG10]], {{.*}} offset:8 +; GCN-DAG: buffer_store_dword [[REG11]], {{.*}} offset:12 +; GCN-DAG: buffer_store_dword [[REG12]], {{.*}} offset:16 +; GCN-DAG: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000 +; GCN-DAG: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000 +; GCN-DAG: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000 +; GCN-DAG: buffer_store_dword [[REG13]], {{.*}} offset:20 +; GCN-DAG: buffer_store_dword [[REG14]], {{.*}} offset:24 +; GCN-DAG: buffer_store_dword [[REG15]], {{.*}} offset:28 ; GCN: v_mov_b32_e32 v31, 0x40e00000 ; GCN: s_getpc Index: llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -50,42 +50,44 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x2: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -229,77 +231,78 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_movk_i32 s1, 0xff +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s8, 0xff00 +; SI-NEXT: s_movk_i32 s9, 0xff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: v_and_b32_e32 v2, s0, v0 +; SI-NEXT: v_and_b32_e32 v2, s8, v0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v0, s9, v0 +; SI-NEXT: v_and_b32_e32 v3, s8, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s1, v1 +; SI-NEXT: v_and_b32_e32 v1, s9, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_movk_i32 s8, 0xff00 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s9, 0xff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_movk_i32 s9, 0xff -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_movk_i32 s10, 0x900 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s4, s2 +; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s2, s6 +; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s8, v1 +; VI-NEXT: v_and_b32_e32 v4, s8, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 +; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s9, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s8, v0 -; VI-NEXT: v_add_u16_e32 v0, 9, v0 -; VI-NEXT: v_and_b32_e32 v0, s9, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_and_b32_e32 v3, s9, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s10, v1 -; VI-NEXT: v_add_u16_e32 v0, s10, v0 +; VI-NEXT: v_add_u16_e32 v2, s10, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -65,40 +65,41 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32: @@ -131,16 +132,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v1 ; SI-NEXT: v_ffbh_u32_e32 v3, v0 @@ -148,19 +149,20 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -170,7 +172,7 @@ ; VI-NEXT: v_ffbh_u32_e32 v3, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v2i32: @@ -206,16 +208,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v2 @@ -229,19 +231,20 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -257,7 +260,7 @@ ; VI-NEXT: v_ffbh_u32_e32 v7, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v4i32: @@ -299,9 +302,9 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +504,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -509,7 +511,7 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v2 ; SI-NEXT: v_ffbh_u32_e32 v5, v3 @@ -520,7 +522,8 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64: @@ -588,7 +591,6 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -596,8 +598,8 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v4 @@ -607,7 +609,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64_trunc: @@ -676,36 +679,37 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -742,36 +746,37 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -809,35 +814,36 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -846,7 +852,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -885,35 +891,36 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -922,7 +929,7 @@ ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -961,34 +968,35 @@ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1030,9 +1038,9 @@ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,36 +1110,37 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i7_sel_eq_neg1: Index: llvm/test/CodeGen/AMDGPU/ctpop64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -45,9 +45,9 @@ ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 -; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; SI-DAG: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-DAG: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} Index: llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -8,34 +8,35 @@ define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_i8_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid @@ -48,38 +49,39 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v2i8_to_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid @@ -92,41 +94,42 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v3i8_to_v3f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v2 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid @@ -139,34 +142,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -174,7 +178,7 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid @@ -191,19 +195,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_unaligned: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -216,19 +220,20 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -253,7 +258,7 @@ ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid @@ -358,88 +363,90 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v7i8_to_v7f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:4 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v3 -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:24 +; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v9, v[2:3] ; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-NEXT: flat_load_ubyte v7, v[6:7] +; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v8, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v8 -; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v3, v4 +; VI-NEXT: v_or_b32_sdwa v1, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v2 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v2 @@ -447,8 +454,8 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid @@ -461,16 +468,16 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v8i8_to_v8f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[7:8], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -480,20 +487,21 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -505,8 +513,8 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid @@ -519,38 +527,39 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 2, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -565,36 +574,37 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_inreg_hi1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -611,34 +621,35 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: i8_zext_i32_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid @@ -652,19 +663,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v4i8_zext_v4i32_to_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:3 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -677,19 +688,20 @@ ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -713,7 +725,7 @@ ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid @@ -727,36 +739,37 @@ define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte0_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -770,36 +783,37 @@ define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte1_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -814,36 +828,37 @@ define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte2_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid @@ -858,36 +873,37 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: extract_byte3_to_f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -2406,10 +2406,10 @@ ; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64: ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] -; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]] -; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 +; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0 ; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]] ; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]] Index: llvm/test/CodeGen/AMDGPU/idot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot2.ll +++ llvm/test/CodeGen/AMDGPU/idot2.ll @@ -102,18 +102,18 @@ ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -377,18 +377,18 @@ ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -648,18 +648,18 @@ ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -854,14 +854,14 @@ ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -873,14 +873,14 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -892,14 +892,14 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1039,18 +1039,18 @@ ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1171,18 +1171,18 @@ ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -2546,88 +2546,89 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1, ; GFX7-LABEL: udot2_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s6, s4, 16 -; GFX7-NEXT: s_and_b32 s4, s4, s8 -; GFX7-NEXT: s_lshr_b32 s7, s5, 16 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s1, 16 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_and_b32 s1, s1, s8 +; GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_lshr_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s3, s2, s1 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -2684,17 +2685,17 @@ ; GFX7-NEXT: s_mov_b32 s11, s3 ; GFX7-NEXT: s_mov_b32 s6, s2 ; GFX7-NEXT: s_mov_b32 s7, s3 -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[8:11], 0 +; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0 ; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8 -; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 +; GFX7-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8 +; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4 +; GFX7-NEXT: v_mad_i32_i24 v0, v0, v1, s4 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -2704,23 +2705,23 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2731,24 +2732,24 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off @@ -2759,24 +2760,24 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off Index: llvm/test/CodeGen/AMDGPU/idot4s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4s.ll +++ llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -108,29 +108,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -177,70 +177,70 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s6, s4 -; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s7, s5 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010 +; GFX7-NEXT: s_sext_i32_i8 s2, s0 +; GFX7-NEXT: s_sext_i32_i8 s3, s1 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_i32 s12, s1, 0x80010 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x80008 ; GFX7-NEXT: s_and_b32 s10, s10, s8 -; GFX7-NEXT: s_and_b32 s6, s6, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010 -; GFX7-NEXT: s_ashr_i32 s5, s5, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x80010 +; GFX7-NEXT: s_ashr_i32 s1, s1, 24 ; GFX7-NEXT: s_and_b32 s12, s12, s8 ; GFX7-NEXT: s_and_b32 s9, s9, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: s_ashr_i32 s4, s4, 24 +; GFX7-NEXT: s_ashr_i32 s0, s0, 24 ; GFX7-NEXT: s_and_b32 s11, s11, s8 -; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: s_and_b32 s1, s1, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s0, s0, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s3, s2 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -248,30 +248,30 @@ ; ; GFX9-NODL-LABEL: idot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off @@ -279,16 +279,16 @@ ; ; GFX9-DL-LABEL: idot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -352,112 +352,113 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: idot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -894,66 +895,66 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: idot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i32 s6, s4, 24 -; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010 -; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010 -; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008 -; GFX7-NEXT: s_ashr_i32 s9, s5, 24 -; GFX7-NEXT: s_sext_i32_i8 s5, s5 -; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s4, s4 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_ashr_i32 s2, s0, 24 +; GFX7-NEXT: s_bfe_i32 s10, s1, 0x80010 +; GFX7-NEXT: s_bfe_i32 s11, s1, 0x80008 +; GFX7-NEXT: s_ashr_i32 s9, s1, 24 +; GFX7-NEXT: s_sext_i32_i8 s1, s1 +; GFX7-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX7-NEXT: s_bfe_i32 s8, s0, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s7, v3, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s3, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s2, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80000 -; GFX8-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s4, 0x80000 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80000 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x80000 +; GFX8-NEXT: s_lshr_b32 s3, s2, 16 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80000 ; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s0 ; GFX8-NEXT: s_bfe_i32 s0, s1, 0x80000 ; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s6 -; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s5 +; GFX8-NEXT: s_and_b32 s1, s6, s5 +; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2 +; GFX8-NEXT: s_bfe_i32 s2, s3, 0x80000 +; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s3 +; GFX8-NEXT: s_and_b32 s3, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v7, s1 -; GFX8-NEXT: s_and_b32 s3, s2, s3 -; GFX8-NEXT: s_and_b32 s0, s2, s0 +; GFX8-NEXT: s_and_b32 s2, s6, s2 +; GFX8-NEXT: s_and_b32 s0, s6, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v7, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1041,17 +1042,18 @@ ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 Index: llvm/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -111,29 +111,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -179,112 +179,112 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_movk_i32 s3, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_and_b32 s1, s0, s3 +; GFX8-NEXT: s_and_b32 s3, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc16: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s3, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s3 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -349,112 +349,113 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s1, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s6, s5 -; GFX7-NEXT: s_and_b32 s5, s4, s5 -; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s2, s1 +; GFX7-NEXT: s_bfe_u32 s8, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s1, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80010 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s1, 0xff +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX8-NEXT: s_and_b32 s3, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s0, s1 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_acc8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_movk_i32 s1, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010 +; GFX9-NODL-NEXT: s_and_b32 s3, s2, s1 +; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -512,120 +513,124 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot2_8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s7, s4, s8 -; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80008 -; GFX7-NEXT: s_and_b32 s6, s5, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s3, s0, s8 +; GFX7-NEXT: s_and_b32 s2, s1, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot2_8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s2, s0 -; GFX8-NEXT: s_and_b32 s0, s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s3, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s1, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot2_8: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s3, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s2, s1, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot2_8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s4, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s3, s2, s0 -; GFX9-DL-NEXT: s_and_b32 s0, s1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s3, s0, s4 +; GFX9-DL-NEXT: s_and_b32 s2, s1, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s1, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s3, s0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -652,114 +657,116 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: s_and_b32 s7, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s2, s0, s8 +; GFX7-NEXT: s_and_b32 s3, s1, s8 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80010 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s3, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationInsideMAD: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -815,157 +822,161 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_CommutationAccrossMADs: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b32 s6, s4, s8 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s7, s5, s8 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008 +; GFX7-NEXT: s_and_b32 s2, s0, s8 +; GFX7-NEXT: s_and_b32 s3, s1, s8 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80010 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_bfe_u32 s11, s1, 0x80010 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s1, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s3, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_CommutationAccrossMADs: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX8-NEXT: s_and_b32 s3, s1, s0 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s3, s1, s4 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: v_mov_b32_e32 v5, s6 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_movk_i32 s4, 0xff ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-NODL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-DL-NEXT: s_movk_i32 s4, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_movk_i32 s0, 0xff +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX9-DL-NEXT: s_and_b32 s3, s1, s0 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s2, s0, s4 +; GFX9-DL-NEXT: s_and_b32 s3, s1, s4 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010 -; GFX9-DL-NEXT: s_and_b32 s0, s2, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1415,64 +1426,64 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: notdot4_mixedtypes: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_sext_i32_i8 s7, s6 -; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008 -; GFX7-NEXT: s_sext_i32_i8 s5, s4 -; GFX7-NEXT: s_and_b32 s7, s7, s8 -; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s3, s2 +; GFX7-NEXT: s_bfe_u32 s9, s2, 0x80008 +; GFX7-NEXT: s_sext_i32_i8 s1, s0 +; GFX7-NEXT: s_and_b32 s3, s3, s8 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010 -; GFX7-NEXT: s_and_b32 s5, s5, s8 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010 -; GFX7-NEXT: s_lshr_b32 s6, s6, 24 +; GFX7-NEXT: s_bfe_u32 s11, s2, 0x80010 +; GFX7-NEXT: s_and_b32 s1, s1, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_bfe_u32 s12, s0, 0x80010 +; GFX7-NEXT: s_lshr_b32 s2, s2, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_lshr_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s0, s0, 24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v3, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: notdot4_mixedtypes: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_sext_i32_i8 s3, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_sext_i32_i8 s4, s0 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1480,30 +1491,30 @@ ; ; GFX9-NODL-LABEL: notdot4_mixedtypes: ; GFX9-NODL: ; %bb.0: ; %entry -; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off @@ -1511,30 +1522,30 @@ ; ; GFX9-DL-LABEL: notdot4_mixedtypes: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -1542,16 +1553,17 @@ ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 ; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 @@ -1798,64 +1810,65 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: s_movk_i32 s7, 0xff +; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_movk_i32 s3, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008 -; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010 -; GFX7-NEXT: s_lshr_b32 s9, s6, 24 -; GFX7-NEXT: s_and_b32 s6, s6, s7 -; GFX7-NEXT: s_lshr_b32 s5, s4, 24 -; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008 -; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010 -; GFX7-NEXT: s_and_b32 s4, s4, s7 -; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: s_bfe_u32 s10, s2, 0x80008 +; GFX7-NEXT: s_bfe_u32 s12, s2, 0x80010 +; GFX7-NEXT: s_lshr_b32 s9, s2, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s3 +; GFX7-NEXT: s_lshr_b32 s1, s0, 24 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80010 +; GFX7-NEXT: s_and_b32 s0, s0, s3 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s12 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s6, s1, s0 -; GFX8-NEXT: s_and_b32 s0, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, s0 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1 -; GFX8-NEXT: s_lshr_b32 s4, s2, 24 -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s6, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s1, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s1 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s0 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1933,17 +1946,18 @@ ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 ; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 @@ -1996,35 +2010,35 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX7-LABEL: udot4_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 -; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008 -; GFX7-NEXT: s_lshr_b32 s7, s4, 16 -; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008 -; GFX7-NEXT: s_lshr_b32 s11, s5, 16 -; GFX7-NEXT: s_lshr_b32 s12, s5, 24 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX7-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX7-NEXT: s_lshr_b32 s11, s1, 16 +; GFX7-NEXT: s_lshr_b32 s12, s1, 24 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 -; GFX7-NEXT: s_lshr_b32 s9, s4, 24 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: s_lshr_b32 s9, s0, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: s_mul_i32 s4, s4, s5 +; GFX7-NEXT: s_mul_i32 s0, s0, s1 ; GFX7-NEXT: v_mul_u32_u24_e32 v1, s9, v1 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s7, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s6, v3 -; GFX7-NEXT: s_and_b32 s5, s4, s8 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s3, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s2, v3 +; GFX7-NEXT: s_and_b32 s1, s0, s8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v2, s5, v3 +; GFX7-NEXT: v_or_b32_e32 v2, s1, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 @@ -2032,33 +2046,34 @@ ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot4_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_movk_i32 s8, 0xff +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX8-NEXT: s_movk_i32 s7, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s5, s0, s7 +; GFX8-NEXT: s_and_b32 s7, s1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_and_b32 s7, s1, s8 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010 ; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX8-NEXT: s_and_b32 s5, s0, s8 ; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mul_u32_u24_e32 v4, s5, v4 ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80010 @@ -2085,34 +2100,34 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2121,49 +2136,50 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,55 +254,55 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 +; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s9, s9, s2 +; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008 +; GFX7-NEXT: s_and_b32 s11, s11, s2 +; GFX7-NEXT: s_and_b32 s8, s8, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 +; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX7-NEXT: s_and_b32 s13, s13, s2 +; GFX7-NEXT: s_and_b32 s10, s10, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 +; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c +; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010 +; GFX7-NEXT: s_and_b32 s15, s15, s2 +; GFX7-NEXT: s_and_b32 s12, s12, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 +; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010 +; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014 +; GFX7-NEXT: s_and_b32 s17, s17, s2 +; GFX7-NEXT: s_and_b32 s14, s14, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 +; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014 +; GFX7-NEXT: s_and_b32 s19, s19, s2 +; GFX7-NEXT: s_and_b32 s16, s16, s2 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 +; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_and_b32 s21, s21, s2 +; GFX7-NEXT: s_and_b32 s18, s18, s2 +; GFX7-NEXT: v_mov_b32_e32 v6, s19 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s2 +; GFX7-NEXT: s_and_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_and_b32 s0, s0, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 @@ -311,176 +311,180 @@ ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_lshr_b32 s7, s2, 12 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_lshr_b32 s2, s0, 12 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_lshr_b32 s7, s2, 12 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 @@ -588,55 +592,55 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 +; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40004 +; GFX7-NEXT: s_and_b32 s9, s9, s2 +; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40004 +; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40008 +; GFX7-NEXT: s_and_b32 s11, s11, s2 +; GFX7-NEXT: s_and_b32 s8, s8, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 +; GFX7-NEXT: s_bfe_i32 s12, s0, 0x40008 +; GFX7-NEXT: s_bfe_i32 s15, s1, 0x4000c +; GFX7-NEXT: s_and_b32 s13, s13, s2 +; GFX7-NEXT: s_and_b32 s10, s10, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 +; GFX7-NEXT: s_bfe_i32 s14, s0, 0x4000c +; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40010 +; GFX7-NEXT: s_and_b32 s15, s15, s2 +; GFX7-NEXT: s_and_b32 s12, s12, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 +; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40010 +; GFX7-NEXT: s_bfe_i32 s19, s1, 0x40014 +; GFX7-NEXT: s_and_b32 s17, s17, s2 +; GFX7-NEXT: s_and_b32 s14, s14, s2 ; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 +; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40018 +; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40014 +; GFX7-NEXT: s_and_b32 s19, s19, s2 +; GFX7-NEXT: s_and_b32 s16, s16, s2 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 +; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40018 ; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_and_b32 s21, s21, s2 +; GFX7-NEXT: s_and_b32 s18, s18, s2 +; GFX7-NEXT: v_mov_b32_e32 v6, s19 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s2 +; GFX7-NEXT: s_and_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_and_b32 s0, s0, s2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 @@ -645,185 +649,186 @@ ; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 ; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s6, 12 -; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_lshr_b32 s5, s0, 12 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v7, s9 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s6, s6, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: idot8_acc8: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_lshr_b32 s5, s0, 12 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: idot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry +; GFX9-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 12 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 ; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 ; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 ; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 @@ -1634,57 +1639,58 @@ ; ; GFX8-LABEL: idot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: flat_load_ushort v2, v[0:1] -; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 8 +; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s0 +; GFX8-NEXT: s_lshr_b32 s15, s0, 4 +; GFX8-NEXT: s_lshr_b32 s16, s0, 8 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX8-NEXT: s_lshr_b32 s8, s0, 4 -; GFX8-NEXT: s_lshr_b32 s9, s0, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1 +; GFX8-NEXT: s_lshr_b32 s8, s1, 4 +; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s7, s0, 12 -; GFX8-NEXT: s_lshr_b32 s14, s2, 12 +; GFX8-NEXT: s_lshr_b32 s7, s1, 12 +; GFX8-NEXT: s_lshr_b32 s14, s0, 12 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14 -; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 16 +; GFX8-NEXT: s_lshr_b32 s6, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s0, 16 ; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 ; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 -; GFX8-NEXT: s_lshr_b32 s5, s0, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 20 +; GFX8-NEXT: s_lshr_b32 s5, s1, 20 +; GFX8-NEXT: s_lshr_b32 s12, s0, 20 ; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 ; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 -; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_lshr_b32 s4, s1, 24 +; GFX8-NEXT: s_lshr_b32 s11, s0, 24 ; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 ; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 -; GFX8-NEXT: s_lshr_b32 s1, s0, 28 -; GFX8-NEXT: s_lshr_b32 s10, s2, 28 +; GFX8-NEXT: s_lshr_b32 s2, s1, 28 +; GFX8-NEXT: s_lshr_b32 s10, s0, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s2 ; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10 ; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 @@ -1710,17 +1716,21 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1730,14 +1740,10 @@ ; GFX9-NEXT: s_and_b32 s18, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] @@ -1777,17 +1783,21 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s4, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1797,14 +1807,10 @@ ; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] @@ -1838,15 +1844,16 @@ ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 @@ -1940,65 +1947,65 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0 -; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: s_mov_b32 s12, 0xffff +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000 -; GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004 -; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008 -; GFX7-NEXT: s_bfe_i32 s19, s8, 0x4000c -; GFX7-NEXT: s_bfe_i32 s20, s8, 0x40010 -; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014 -; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018 -; GFX7-NEXT: s_ashr_i32 s8, s8, 28 -; GFX7-NEXT: v_mov_b32_e32 v7, s17 +; GFX7-NEXT: s_bfe_i32 s8, s0, 0x40000 +; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40000 +; GFX7-NEXT: s_bfe_i32 s17, s1, 0x40004 +; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40008 +; GFX7-NEXT: s_bfe_i32 s19, s1, 0x4000c +; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40010 +; GFX7-NEXT: s_bfe_i32 s21, s1, 0x40014 +; GFX7-NEXT: s_bfe_i32 s22, s1, 0x40018 +; GFX7-NEXT: s_ashr_i32 s1, s1, 28 ; GFX7-NEXT: v_mov_b32_e32 v8, s16 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008 +; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v7, s17 +; GFX7-NEXT: s_bfe_i32 s10, s0, 0x40008 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX7-NEXT: s_bfe_i32 s11, s0, 0x4000c ; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40010 +; GFX7-NEXT: s_bfe_i32 s13, s0, 0x40010 ; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX7-NEXT: s_bfe_i32 s14, s0, 0x40014 ; GFX7-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40018 +; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40018 ; GFX7-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s2, v1 +; GFX7-NEXT: s_ashr_i32 s0, s0, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s0, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v2, s15, v2 ; GFX7-NEXT: v_mul_i32_i24_e32 v3, s14, v3 ; GFX7-NEXT: v_mul_i32_i24_e32 v9, s13, v4 -; GFX7-NEXT: v_mul_i32_i24_e32 v5, s12, v5 -; GFX7-NEXT: v_mul_i32_i24_e32 v6, s11, v6 -; GFX7-NEXT: v_mul_i32_i24_e32 v7, s10, v7 -; GFX7-NEXT: v_mul_i32_i24_e32 v8, s9, v8 +; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 +; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 +; GFX7-NEXT: v_mul_i32_i24_e32 v7, s9, v7 +; GFX7-NEXT: v_mul_i32_i24_e32 v8, s8, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v9, s2, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, s2, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_and_b32_e32 v8, s2, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s12, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s12, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 @@ -2021,38 +2028,39 @@ ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s8, s1, 4 -; GFX8-NEXT: s_lshr_b32 s9, s1, 12 -; GFX8-NEXT: s_lshr_b32 s10, s1, 8 -; GFX8-NEXT: s_lshr_b32 s15, s2, 4 -; GFX8-NEXT: s_lshr_b32 s16, s2, 12 -; GFX8-NEXT: s_lshr_b32 s17, s2, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s8, s0, 4 +; GFX8-NEXT: s_lshr_b32 s9, s0, 12 +; GFX8-NEXT: s_lshr_b32 s10, s0, 8 +; GFX8-NEXT: s_lshr_b32 s15, s1, 4 +; GFX8-NEXT: s_lshr_b32 s16, s1, 12 +; GFX8-NEXT: s_lshr_b32 s17, s1, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s9 ; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s8 ; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s17 ; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s16 ; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX8-NEXT: s_lshr_b32 s4, s1, 20 -; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_lshr_b32 s7, s1, 24 -; GFX8-NEXT: s_lshr_b32 s11, s2, 20 -; GFX8-NEXT: s_lshr_b32 s12, s2, 16 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: s_lshr_b32 s4, s0, 20 +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s0, 28 +; GFX8-NEXT: s_lshr_b32 s7, s0, 24 +; GFX8-NEXT: s_lshr_b32 s11, s1, 20 +; GFX8-NEXT: s_lshr_b32 s12, s1, 16 +; GFX8-NEXT: s_lshr_b32 s13, s1, 28 +; GFX8-NEXT: s_lshr_b32 s14, s1, 24 ; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s6 ; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s5 @@ -2086,11 +2094,11 @@ ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mul_u32_u24_sdwa v9, v10, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v9, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v5, s2, v9 ; GFX8-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX8-NEXT: v_or_b32_e32 v6, v5, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 @@ -2109,13 +2117,14 @@ ; ; GFX9-LABEL: idot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2197,13 +2206,14 @@ ; ; GFX9-DL-LABEL: idot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2285,16 +2295,17 @@ ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 ; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,17 +254,16 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -273,6 +272,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -301,159 +301,163 @@ ; ; GFX8-LABEL: udot8_acc16: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -556,17 +560,16 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -575,6 +578,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -603,159 +607,163 @@ ; ; GFX8-LABEL: udot8_acc8: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -858,17 +866,16 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -877,6 +884,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -906,168 +914,172 @@ ; ; GFX8-LABEL: udot8_acc4: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -1157,17 +1169,16 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -1176,6 +1187,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -1205,168 +1217,172 @@ ; ; GFX8-LABEL: udot8_CommutationInsideMAD: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_CommutationInsideMAD: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_CommutationInsideMAD: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -1909,29 +1925,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1972,20 +1988,18 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40004 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c ; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40004 ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 @@ -1993,6 +2007,8 @@ ; GFX7-NEXT: s_and_b32 s19, s1, 15 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s20 ; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 @@ -2029,49 +2045,50 @@ ; ; GFX8-LABEL: udot8_acc16_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2080,53 +2097,53 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-NEXT: s_and_b32 s4, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s14, s15 ; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s12, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v5 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s6, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s8, s9 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s17 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s5, v0 +; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s5, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2135,67 +2152,68 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-DL-NEXT: s_and_b32 s4, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s14, s15 ; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s12, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s9 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s16, s17 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s5, v0 +; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s10, s11 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s5, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 @@ -2273,26 +2291,26 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 ; GFX7-NEXT: s_bfe_u32 s14, s1, 0x4000c ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s18, s1, 28 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: v_mov_b32_e32 v8, s14 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40008 ; GFX7-NEXT: s_and_b32 s17, s1, 15 ; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 ; GFX7-NEXT: s_lshr_b32 s11, s0, 28 ; GFX7-NEXT: v_mov_b32_e32 v4, s18 ; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 @@ -2346,29 +2364,30 @@ ; ; GFX8-LABEL: udot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX8-NEXT: s_bfe_u32 s10, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s16, s2, 15 -; GFX8-NEXT: s_bfe_u32 s17, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40014 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s9, s1, 15 +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s17, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_lshr_b32 s6, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_lshr_b32 s13, s1, 28 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 +; GFX8-NEXT: s_and_b32 s9, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_mov_b32_e32 v5, s10 ; GFX8-NEXT: v_mov_b32_e32 v6, s16 @@ -2377,27 +2396,27 @@ ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_u32_u24_e32 v5, s9, v6 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40018 ; GFX8-NEXT: v_mov_b32_e32 v9, s14 -; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mov_b32_e32 v11, s6 ; GFX8-NEXT: v_mov_b32_e32 v12, s12 ; GFX8-NEXT: v_mov_b32_e32 v13, s11 ; GFX8-NEXT: v_mov_b32_e32 v14, s4 -; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_mul_u32_u24_e32 v7, s7, v9 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_mul_u32_u24_e32 v9, s5, v12 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s0, v9 +; GFX8-NEXT: v_and_b32_e32 v4, s2, v9 ; GFX8-NEXT: v_or_b32_e32 v3, v5, v3 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v7 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 @@ -2416,13 +2435,14 @@ ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2482,13 +2502,14 @@ ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2548,16 +2569,17 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 @@ -2645,17 +2667,16 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 +; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 ; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 ; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 @@ -2664,6 +2685,7 @@ ; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 ; GFX7-NEXT: s_lshr_b32 s14, s1, 28 ; GFX7-NEXT: s_and_b32 s1, s1, 15 +; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 ; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 ; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 ; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c @@ -2693,168 +2715,172 @@ ; ; GFX8-LABEL: udot8_acc4_vecMul: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 ; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 @@ -3078,18 +3104,18 @@ ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, Index: llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -205,7 +205,7 @@ ; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3 ; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] ; GCN: s_mov_b32 [[K:s[0-9]+]], 0x1010101 -; GCN: s_and_b32 s3, s1, [[K]] +; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] ; GCN: s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1617,36 +1617,33 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { ; SI-LABEL: dynamic_insertelement_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; SI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x10 -; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 +; SI-NEXT: s_load_dword s6, s[4:5], 0x20 ; SI-NEXT: v_mov_b32_e32 v16, 64 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: s_and_b32 s4, s4, 7 -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: v_mov_b32_e32 v2, s14 -; SI-NEXT: v_mov_b32_e32 v3, s15 -; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v5, s17 -; SI-NEXT: v_mov_b32_e32 v6, s18 -; SI-NEXT: v_mov_b32_e32 v7, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s23 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s6, s6, 7 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: s_lshl_b32 s6, s6, 3 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v4, s12 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v11, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v14, s22 +; SI-NEXT: v_mov_b32_e32 v15, s23 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 -; SI-NEXT: v_or_b32_e32 v16, s4, v16 +; SI-NEXT: v_or_b32_e32 v16, s6, v16 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen @@ -1654,7 +1651,10 @@ ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 ; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 @@ -1663,36 +1663,33 @@ ; ; VI-LABEL: dynamic_insertelement_v8f64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40 -; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_load_dword s6, s[4:5], 0x80 +; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 ; VI-NEXT: v_mov_b32_e32 v16, 64 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s12 -; VI-NEXT: s_and_b32 s4, s4, 7 -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_mov_b32_e32 v2, s14 -; VI-NEXT: v_mov_b32_e32 v3, s15 -; VI-NEXT: v_mov_b32_e32 v4, s16 -; VI-NEXT: v_mov_b32_e32 v5, s17 -; VI-NEXT: v_mov_b32_e32 v6, s18 -; VI-NEXT: v_mov_b32_e32 v7, s19 -; VI-NEXT: v_mov_b32_e32 v8, s20 -; VI-NEXT: v_mov_b32_e32 v9, s21 -; VI-NEXT: v_mov_b32_e32 v10, s22 -; VI-NEXT: v_mov_b32_e32 v11, s23 -; VI-NEXT: v_mov_b32_e32 v12, s24 -; VI-NEXT: v_mov_b32_e32 v13, s25 -; VI-NEXT: v_mov_b32_e32 v14, s26 -; VI-NEXT: v_mov_b32_e32 v15, s27 +; VI-NEXT: s_and_b32 s6, s6, 7 +; VI-NEXT: v_mov_b32_e32 v2, s10 +; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: s_lshl_b32 s6, s6, 3 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v4, s12 +; VI-NEXT: v_mov_b32_e32 v5, s13 +; VI-NEXT: v_mov_b32_e32 v6, s14 +; VI-NEXT: v_mov_b32_e32 v7, s15 +; VI-NEXT: v_mov_b32_e32 v8, s16 +; VI-NEXT: v_mov_b32_e32 v9, s17 +; VI-NEXT: v_mov_b32_e32 v10, s18 +; VI-NEXT: v_mov_b32_e32 v11, s19 +; VI-NEXT: v_mov_b32_e32 v12, s20 +; VI-NEXT: v_mov_b32_e32 v13, s21 +; VI-NEXT: v_mov_b32_e32 v14, s22 +; VI-NEXT: v_mov_b32_e32 v15, s23 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 -; VI-NEXT: v_or_b32_e32 v16, s4, v16 +; VI-NEXT: v_or_b32_e32 v16, s6, v16 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 ; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen @@ -1700,7 +1697,10 @@ ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 ; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; VI-NEXT: s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -43,11 +43,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -57,13 +57,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -73,12 +73,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_and_b32 s1, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -95,11 +95,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -113,33 +113,33 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s2 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_and_b32 s0, s4, 0xffff +; CI-NEXT: s_load_dword s0, s[4:5], 0xc +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s2, 16 ; CI-NEXT: s_lshl_b32 s2, s1, 16 +; CI-NEXT: s_and_b32 s0, s0, 0xffff ; CI-NEXT: s_or_b32 s0, s0, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -162,11 +162,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -176,13 +176,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -192,12 +192,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -216,16 +216,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s1 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -234,17 +234,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -253,17 +253,17 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; CI-NEXT: s_or_b32 s1, s0, s1 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -282,20 +282,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s1 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s1 +; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -304,36 +304,36 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s2, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s2 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: s_load_dword s0, s[4:5], 0x4 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshr_b32 s1, s2, 16 ; CI-NEXT: s_lshl_b32 s2, s1, 16 +; CI-NEXT: s_lshr_b32 s0, s0, 16 ; CI-NEXT: s_or_b32 s2, s0, s2 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -397,11 +397,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -411,13 +411,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: s_lshl_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -427,12 +427,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff +; CI-NEXT: s_and_b32 s0, s0, 0xffff ; CI-NEXT: s_or_b32 s0, s0, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -1103,14 +1103,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 { ; GFX9-LABEL: s_insertelement_v2i16_dynamic: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 ; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1121,14 +1121,14 @@ ; ; VI-LABEL: s_insertelement_v2i16_dynamic: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_load_dword s0, s[4:5], 0x0 -; VI-NEXT: s_load_dword s1, s[2:3], 0x0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s0, s0, 4 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1139,14 +1139,14 @@ ; ; CI-LABEL: s_insertelement_v2i16_dynamic: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_load_dword s0, s[4:5], 0x0 -; CI-NEXT: s_load_dword s1, s[2:3], 0x0 +; CI-NEXT: s_load_dword s0, s[0:1], 0x0 +; CI-NEXT: s_load_dword s1, s[6:7], 0x0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b32 s0, s0, 4 ; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 @@ -1237,81 +1237,81 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { ; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_mov_b32 s7, 0x12341234 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_mov_b32 s3, 0x12341234 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3 +; GFX9-NEXT: v_bfi_b32 v2, v2, s3, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 +; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0x12341234 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: s_mov_b32 s2, 0xffff +; VI-NEXT: s_mov_b32 s3, 0x12341234 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s7, v3 +; VI-NEXT: v_bfi_b32 v2, v2, s3, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 +; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s6, 0x12341234 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; CI-NEXT: s_mov_b32 s2, 0x12341234 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 +; CI-NEXT: v_mov_b32_e32 v3, s7 +; CI-NEXT: v_add_i32_e32 v0, vcc, s6, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 +; CI-NEXT: v_mov_b32_e32 v1, s5 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 ; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, v2, s6, v3 +; CI-NEXT: v_bfi_b32 v2, v2, s2, v3 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1686,27 +1686,27 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 ; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1 +; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0 +; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: @@ -1754,16 +1754,16 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_lshl_b32 s2, s6, 16 ; CI-NEXT: s_and_b32 s3, s6, s4 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_or_b32 s1, s3, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_or_b32 s0, s3, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_bfi_b32 v1, v5, s1, v1 -; CI-NEXT: v_bfi_b32 v0, v4, s1, v0 +; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 +; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -9,11 +9,11 @@ ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -30,11 +30,11 @@ ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1537,8 +1537,8 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, s2 Index: llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -155,15 +155,15 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; SI-LABEL: round_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s19, s0, s18 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 ; SI-NEXT: s_and_b64 s[12:13], s[10:11], s[0:1] ; SI-NEXT: s_mov_b32 s15, 0x80000 @@ -263,15 +263,15 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; SI-LABEL: round_v4f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s22, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s23, s0, s22 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s23 ; SI-NEXT: s_and_b64 s[16:17], s[10:11], s[0:1] ; SI-NEXT: s_mov_b32 s19, 0x80000 @@ -393,47 +393,47 @@ ; ; CI-LABEL: round_v4f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x11 -; CI-NEXT: s_brev_b32 s2, -2 +; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 +; CI-NEXT: s_brev_b32 s12, -2 ; CI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_mov_b32_e32 v4, s11 -; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 +; CI-NEXT: v_trunc_f64_e32 v[0:1], s[6:7] +; CI-NEXT: v_mov_b32_e32 v4, s7 +; CI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[4:5] ; CI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] -; CI-NEXT: v_add_f64 v[0:1], s[8:9], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v4, s9 +; CI-NEXT: v_add_f64 v[0:1], s[4:5], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v4, s5 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[0:1]|, 0.5 -; CI-NEXT: v_bfi_b32 v4, s2, v12, v4 +; CI-NEXT: v_bfi_b32 v4, s12, v12, v4 ; CI-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[14:15] -; CI-NEXT: v_mov_b32_e32 v10, s15 -; CI-NEXT: v_add_f64 v[6:7], s[14:15], -v[4:5] -; CI-NEXT: v_bfi_b32 v10, s2, v12, v10 +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[10:11] +; CI-NEXT: v_mov_b32_e32 v10, s11 +; CI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] +; CI-NEXT: v_bfi_b32 v10, s12, v12, v10 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v6, 0 ; CI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[8:9] ; CI-NEXT: v_add_f64 v[6:7], v[4:5], v[6:7] -; CI-NEXT: v_add_f64 v[4:5], s[12:13], -v[10:11] -; CI-NEXT: v_mov_b32_e32 v13, s13 +; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[10:11] +; CI-NEXT: v_mov_b32_e32 v13, s9 ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; CI-NEXT: v_bfi_b32 v12, s2, v12, v13 +; CI-NEXT: v_bfi_b32 v12, s12, v12, v13 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_add_f64 v[4:5], v[10:11], v[4:5] ; CI-NEXT: v_add_f64 v[0:1], v[8:9], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out @@ -443,15 +443,15 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { ; SI-LABEL: round_v8f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_movk_i32 s30, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: s_add_i32 s31, s0, s30 +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s31 ; SI-NEXT: s_and_b64 s[26:27], s[10:11], s[0:1] ; SI-NEXT: s_mov_b32 s25, 0x80000 @@ -687,12 +687,11 @@ ; ; CI-LABEL: round_v8f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x19 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: v_mov_b32_e32 v16, 0x3ff00000 +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] ; CI-NEXT: v_mov_b32_e32 v4, s11 @@ -751,6 +750,7 @@ ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[18:19] ; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[14:15]|, 0.5 ; CI-NEXT: v_add_f64 v[14:15], s[18:19], -v[16:17] +; CI-NEXT: s_mov_b32 s6, -1 ; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[14:15]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v14, 0 ; CI-NEXT: v_cndmask_b32_e64 v15, 0, v18, s[0:1] Index: llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -185,23 +185,24 @@ ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s1, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, s1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshrrev_b32_e32 v3, s2, v3 +; CI-NEXT: v_lshrrev_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -260,23 +261,24 @@ ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s1, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, s1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 -; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s8, v2 +; CI-NEXT: v_and_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshr_b32_e32 v3, s2, v3 +; CI-NEXT: v_lshr_b32_e32 v2, s0, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 Index: llvm/test/CodeGen/AMDGPU/madak.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/madak.ll +++ llvm/test/CodeGen/AMDGPU/madak.ll @@ -14,8 +14,8 @@ ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 @@ -105,8 +105,8 @@ ; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]] ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -64,6 +64,7 @@ ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v13, s19 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -71,7 +72,6 @@ ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 @@ -79,13 +79,13 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off ; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 ; GCN-NEXT: s_endpgm bb: @@ -198,24 +198,23 @@ ; GCN-LABEL: vector_clause_indirect: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], s[2:3] +; GCN-NEXT: v_mov_b32_e32 v11, s5 +; GCN-NEXT: v_mov_b32_e32 v10, s4 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 -; GCN-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NEXT: v_mov_b32_e32 v8, s4 +; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_store_dwordx4 v[8:9], v[0:3], off -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v[10:11], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[10:11], v[4:7], off offset:16 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -15,13 +15,13 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -88,6 +88,8 @@ ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 @@ -95,15 +97,13 @@ ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 -; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off @@ -224,15 +224,15 @@ ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072 -; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; -; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 +; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -299,9 +299,9 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} @@ -455,12 +455,12 @@ ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} Index: llvm/test/CodeGen/AMDGPU/scratch-simple.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -29,8 +29,8 @@ ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN-DAG: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -101,10 +101,10 @@ ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GCN: s_mov_b32 s2, s5 +; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -120,10 +120,10 @@ ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GCN: s_mov_b32 s2, s5 +; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx Index: llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll +++ llvm/test/CodeGen/AMDGPU/setcc-limit-load-shrink.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}const_load_no_shrink_dword_to_unaligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], s[{{[0-9:]+}}], 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @const_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x @@ -14,7 +14,7 @@ } ; GCN-LABEL: const_load_no_shrink_dword_to_aligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], s[{{[0-9:]+}}], 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @const_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(4)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(4)* %in, i32 %x @@ -27,7 +27,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_unaligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], s[{{[0-9:]+}}], 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10013 define amdgpu_kernel void @global_load_no_shrink_dword_to_unaligned_byte(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x @@ -40,7 +40,7 @@ } ; GCN-LABEL: global_load_no_shrink_dword_to_aligned_byte: -; GCN: s_load_dword [[LD:s[0-9]+]], +; GCN: s_load_dword [[LD:s[0-9]+]], s[{{[0-9:]+}}], 0x0 ; GCN: s_bfe_i32 s{{[0-9]+}}, [[LD]], 0x10003 define amdgpu_kernel void @global_load_no_shrink_dword_to_aligned_byte(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %x) { %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %x Index: llvm/test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -186,23 +186,24 @@ ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s9, 0xffff -; CI-NEXT: s_lshr_b32 s10, s8, 16 -; CI-NEXT: s_and_b32 s8, s8, s9 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s1, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s2, s0, 16 +; CI-NEXT: s_and_b32 s0, s0, s1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 -; CI-NEXT: v_and_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s2, v3 +; CI-NEXT: v_and_b32_e32 v2, s1, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -261,24 +262,25 @@ ; CI-LABEL: shl_s_v_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, 0 +; CI-NEXT: s_mov_b32 s11, 0xf000 +; CI-NEXT: s_mov_b32 s10, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_lshr_b32 s1, s8, 16 -; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b64 s[8:9], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s1, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[10:11] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s2, s0, 16 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, s0, v2 +; CI-NEXT: v_and_b32_e32 v3, s1, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 -; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 +; CI-NEXT: v_lshl_b32_e32 v2, s2, v2 +; CI-NEXT: v_lshl_b32_e32 v3, s0, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_and_b32_e32 v3, s0, v3 +; CI-NEXT: v_and_b32_e32 v3, s1, v3 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/sign_extend.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -294,9 +294,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -309,9 +309,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 Index: llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -3,8 +3,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-FUNC: {{^}}vccz_workaround: -; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 -; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 +; GCN-DAG: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} ; VCCZ-BUG: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; GCN-NOT: s_mov_b64 vcc, vcc Index: llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -148,11 +148,11 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: s_and_b32 s0, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s0 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 ; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: s_endpgm ; @@ -164,11 +164,11 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: s_and_b32 s0, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s0 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 ; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: s_endpgm ; @@ -180,9 +180,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -65,8 +65,8 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 { ; GFX9-LABEL: s_test_sub_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -485,15 +485,15 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_pk_sub_i16 v2, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/wwm-reserved.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -147,11 +147,11 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) { %tmp17 = shl i32 %index, 5 -; GFX9: buffer_load_dwordx4 +; GFX9-DAG: buffer_load_dwordx4 %tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0) %.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64> %tmp19 = or i32 %tmp17, 16 -; GFX9: buffer_load_dwordx2 +; GFX9-DAG: buffer_load_dwordx2 %tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0) %.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0 %tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)