diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -230,33 +230,11 @@ // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand; - if (TopCand.Reason == BotCand.Reason) { - Cand = BotCand; - GenericSchedulerBase::CandReason TopReason = TopCand.Reason; - TopCand.Reason = NoCand; - GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); - } else { - TopCand.Reason = TopReason; - } - } else { - if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = BotCand; - } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = BotCand; - } else { - if (BotCand.Reason > TopCand.Reason) { - Cand = TopCand; - } else { - Cand = BotCand; - } - } + SchedCandidate Cand = BotCand; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -10,9 +10,9 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: s_add_u32 s2, 4, 4 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v2, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -203,8 +203,8 @@ ; GFX9: buffer_store_dwordx4 ; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: flat_load_dword v[[A:[0-9]+]] -; VI: flat_load_dword v[[B:[0-9]+]] +; VI-DAG: flat_load_dword v[[A:[0-9]+]] +; VI-DAG: flat_load_dword v[[B:[0-9]+]] ; VI-DAG: v_add_u16_e32 ; VI: v_add_u16_sdwa v[[ADD_HI:[0-9]+]], v[[A]], v[[B]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -245,12 +245,12 @@ ; ; GFX10-LABEL: add3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -148,7 +148,6 @@ ; GFX1064: s_or_saveexec_b64 s[4:5], -1 ; GFX1064: v_mov_b32_e32 v3, v1 ; GFX1064: v_mov_b32_e32 v4, v1 -; GFX1064: s_mov_b32 s2, -1 ; GFX1064: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v3 ; GFX1064: v_mov_b32_e32 v3, v1 @@ -165,17 +164,18 @@ ; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v4 ; GFX1064: v_mov_b32_e32 v4, v1 -; GFX1064: v_readlane_b32 s3, v2, 31 -; GFX1064: v_mov_b32_e32 v3, s3 +; GFX1064: v_readlane_b32 s2, v2, 31 +; GFX1064: v_mov_b32_e32 v3, s2 ; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1064: v_add_nc_u32_e32 v2, v2, v4 ; GFX1064: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064: v_readlane_b32 s3, v2, 15 -; GFX1064: v_readlane_b32 s6, v2, 31 -; GFX1064: v_writelane_b32 v1, s3, 16 -; GFX1064: v_readlane_b32 s3, v2, 63 -; GFX1064: v_writelane_b32 v1, s6, 32 +; GFX1064: v_readlane_b32 s2, v2, 15 +; GFX1064: v_readlane_b32 s3, v2, 31 ; GFX1064: v_readlane_b32 s6, v2, 47 +; GFX1064: v_writelane_b32 v1, s2, 16 +; GFX1064: s_mov_b32 s2, -1 +; GFX1064: v_writelane_b32 v1, s3, 32 +; GFX1064: v_readlane_b32 s3, v2, 63 ; GFX1064: v_writelane_b32 v1, s6, 48 ; GFX1064: s_mov_b64 exec, s[4:5] ; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr-spill-to-smem.ll @@ -1,4 +1,5 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s +; REQUIRES: disabled +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -enable-misched=false -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s ; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -391,13 +391,13 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s6, 0xcccccccc -; SI-NEXT: s_mov_b32 s8, 0x55555555 -; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s6, 0xff00 +; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s10, 0x33333333 +; SI-NEXT: s_mov_b32 s11, 0xcccccccc +; SI-NEXT: s_mov_b32 s0, 0x55555555 +; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[2:3], v[0:1], 8 ; SI-NEXT: v_alignbit_b32 v4, v1, v0, 24 @@ -410,36 +410,36 @@ ; SI-NEXT: v_and_b32_e32 v0, 0xff0000, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xff0000, v4 ; SI-NEXT: v_and_b32_e32 v5, 0xff000000, v5 -; SI-NEXT: v_and_b32_e32 v7, s0, v7 +; SI-NEXT: v_and_b32_e32 v7, s6, v7 +; SI-NEXT: v_and_b32_e32 v2, s6, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, s0, v2 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_or_b32_e32 v5, v7, v6 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_or_b32_e32 v1, v4, v5 ; SI-NEXT: v_or_b32_e32 v3, v0, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v1 -; SI-NEXT: v_and_b32_e32 v2, s2, v1 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s8, v1 +; SI-NEXT: v_and_b32_e32 v2, s9, v1 +; SI-NEXT: v_and_b32_e32 v1, s8, v3 +; SI-NEXT: v_and_b32_e32 v3, s9, v3 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s10, v3 +; SI-NEXT: v_and_b32_e32 v0, s10, v2 +; SI-NEXT: v_and_b32_e32 v3, s11, v3 +; SI-NEXT: v_and_b32_e32 v2, s11, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s8, v3 -; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v3 -; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v1, s0, v3 +; SI-NEXT: v_and_b32_e32 v0, s0, v2 +; SI-NEXT: v_and_b32_e32 v3, s1, v3 +; SI-NEXT: v_and_b32_e32 v2, s1, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 @@ -452,60 +452,60 @@ ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: v_mov_b32_e32 v4, 8 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f0 +; FLAT-NEXT: v_mov_b32_e32 v2, 8 +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s6, 0x55555555 -; FLAT-NEXT: s_mov_b32 s8, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_lshlrev_b64 v[2:3], 24, v[0:1] -; FLAT-NEXT: v_alignbit_b32 v2, v1, v0, 24 -; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 8 -; FLAT-NEXT: v_lshlrev_b32_sdwa v7, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 8, v[0:1] -; FLAT-NEXT: v_lshlrev_b32_e32 v4, 24, v0 +; FLAT-NEXT: v_lshlrev_b64 v[4:5], 24, v[0:1] +; FLAT-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; FLAT-NEXT: v_lshlrev_b64 v[2:3], 8, v[0:1] +; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 24 +; FLAT-NEXT: v_alignbit_b32 v7, v1, v0, 8 +; FLAT-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; FLAT-NEXT: v_and_b32_e32 v2, 0xff0000, v2 -; FLAT-NEXT: v_and_b32_e32 v6, 0xff000000, v6 ; FLAT-NEXT: v_and_b32_e32 v0, 0xff0000, v0 -; FLAT-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_or_b32_e32 v2, v6, v2 -; FLAT-NEXT: v_and_b32_e32 v3, 0xff00, v3 -; FLAT-NEXT: v_or_b32_e32 v1, v2, v1 -; FLAT-NEXT: v_or_b32_e32 v0, v4, v0 -; FLAT-NEXT: v_or_b32_sdwa v2, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FLAT-NEXT: v_and_b32_e32 v4, 0xff0000, v6 +; FLAT-NEXT: v_and_b32_e32 v6, 0xff000000, v7 +; FLAT-NEXT: v_and_b32_e32 v5, 0xff00, v5 +; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 +; FLAT-NEXT: v_or_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; FLAT-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; FLAT-NEXT: v_or_b32_e32 v4, v6, v4 +; FLAT-NEXT: v_or_b32_e32 v1, v4, v1 ; FLAT-NEXT: v_or_b32_e32 v3, v0, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v1 -; FLAT-NEXT: v_and_b32_e32 v2, s3, v1 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v1 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v1 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] +; FLAT-NEXT: s_mov_b32 s0, 0x55555555 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] +; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s6, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s6, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s8, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -778,23 +778,23 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { ; SI-LABEL: v_brev_v2i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_mov_b32 s5, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s6, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s7, 0x33333333 ; SI-NEXT: s_mov_b32 s8, 0xcccccccc ; SI-NEXT: s_mov_b32 s9, 0x55555555 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshl_b64 v[4:5], v[2:3], 8 ; SI-NEXT: v_alignbit_b32 v6, v3, v2, 24 @@ -819,7 +819,7 @@ ; SI-NEXT: v_mov_b32_e32 v7, 0xff00 ; SI-NEXT: v_and_b32_e32 v2, v0, v11 ; SI-NEXT: v_and_b32_e32 v11, v0, v12 -; SI-NEXT: v_and_b32_e32 v9, s0, v9 +; SI-NEXT: v_and_b32_e32 v9, s4, v9 ; SI-NEXT: v_and_b32_e32 v12, 0xff000000, v13 ; SI-NEXT: v_and_b32_e32 v0, v0, v17 ; SI-NEXT: v_and_b32_e32 v13, v7, v15 @@ -828,7 +828,7 @@ ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v2, v10, v2 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v4, s0, v4 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 ; SI-NEXT: v_or_b32_e32 v7, v16, v0 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_or_b32_e32 v9, v12, v11 @@ -838,14 +838,14 @@ ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_or_b32_e32 v7, v7, v1 ; SI-NEXT: v_or_b32_e32 v3, v2, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v6 -; SI-NEXT: v_and_b32_e32 v2, s2, v6 -; SI-NEXT: v_and_b32_e32 v4, s1, v5 -; SI-NEXT: v_and_b32_e32 v6, s2, v5 -; SI-NEXT: v_and_b32_e32 v5, s1, v7 -; SI-NEXT: v_and_b32_e32 v7, s2, v7 -; SI-NEXT: v_and_b32_e32 v1, s1, v3 -; SI-NEXT: v_and_b32_e32 v3, s2, v3 +; SI-NEXT: v_and_b32_e32 v0, s5, v6 +; SI-NEXT: v_and_b32_e32 v2, s6, v6 +; SI-NEXT: v_and_b32_e32 v4, s5, v5 +; SI-NEXT: v_and_b32_e32 v6, s6, v5 +; SI-NEXT: v_and_b32_e32 v5, s5, v7 +; SI-NEXT: v_and_b32_e32 v7, s6, v7 +; SI-NEXT: v_and_b32_e32 v1, s5, v3 +; SI-NEXT: v_and_b32_e32 v3, s6, v3 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 ; SI-NEXT: v_lshr_b64 v[6:7], v[6:7], 4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 @@ -854,12 +854,12 @@ ; SI-NEXT: v_or_b32_e32 v6, v6, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v5, s3, v7 -; SI-NEXT: v_and_b32_e32 v4, s3, v6 +; SI-NEXT: v_and_b32_e32 v5, s7, v7 +; SI-NEXT: v_and_b32_e32 v4, s7, v6 ; SI-NEXT: v_and_b32_e32 v7, s8, v7 ; SI-NEXT: v_and_b32_e32 v6, s8, v6 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 +; SI-NEXT: v_and_b32_e32 v1, s7, v3 +; SI-NEXT: v_and_b32_e32 v0, s7, v2 ; SI-NEXT: v_and_b32_e32 v3, s8, v3 ; SI-NEXT: v_and_b32_e32 v2, s8, v2 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 @@ -886,7 +886,8 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v1, v5, v8 ; SI-NEXT: v_or_b32_e32 v0, v4, v7 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: v_brev_v2i64: @@ -895,63 +896,63 @@ ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: v_mov_b32_e32 v8, 8 -; FLAT-NEXT: v_mov_b32_e32 v10, 0xff0000 -; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f0 -; FLAT-NEXT: s_mov_b32 s1, 0x33333333 -; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 ; FLAT-NEXT: s_mov_b32 s8, 0x55555555 ; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_lshlrev_b64 v[4:5], 24, v[2:3] +; FLAT-NEXT: v_lshlrev_b64 v[6:7], 24, v[2:3] ; FLAT-NEXT: v_lshlrev_b32_sdwa v12, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; FLAT-NEXT: v_lshlrev_b32_sdwa v15, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; FLAT-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; FLAT-NEXT: v_lshlrev_b64 v[8:9], 8, v[0:1] -; FLAT-NEXT: v_lshlrev_b64 v[6:7], 8, v[2:3] -; FLAT-NEXT: v_alignbit_b32 v4, v3, v2, 24 +; FLAT-NEXT: v_alignbit_b32 v10, v3, v2, 24 ; FLAT-NEXT: v_alignbit_b32 v11, v3, v2, 8 +; FLAT-NEXT: v_lshlrev_b64 v[4:5], 8, v[2:3] +; FLAT-NEXT: v_alignbit_b32 v13, v1, v0, 8 ; FLAT-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_or_b32_sdwa v12, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; FLAT-NEXT: v_alignbit_b32 v13, v1, v0, 24 -; FLAT-NEXT: v_alignbit_b32 v14, v1, v0, 8 +; FLAT-NEXT: v_alignbit_b32 v6, v1, v0, 24 ; FLAT-NEXT: v_lshlrev_b32_e32 v8, 24, v0 ; FLAT-NEXT: v_lshlrev_b32_e32 v15, 8, v0 +; FLAT-NEXT: v_or_b32_sdwa v12, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 24, v[0:1] -; FLAT-NEXT: v_lshlrev_b32_e32 v6, 24, v2 +; FLAT-NEXT: v_lshlrev_b32_e32 v4, 24, v2 +; FLAT-NEXT: v_mov_b32_e32 v0, 0xff0000 ; FLAT-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; FLAT-NEXT: v_and_b32_e32 v0, 0xff0000, v4 -; FLAT-NEXT: v_and_b32_e32 v4, 0xff000000, v11 -; FLAT-NEXT: v_and_b32_e32 v2, v10, v2 -; FLAT-NEXT: v_and_b32_e32 v11, v10, v13 -; FLAT-NEXT: v_or_b32_e32 v0, v4, v0 +; FLAT-NEXT: v_and_b32_e32 v2, v0, v2 ; FLAT-NEXT: v_and_b32_e32 v1, 0xff00, v1 -; FLAT-NEXT: v_and_b32_e32 v13, 0xff000000, v14 -; FLAT-NEXT: v_and_b32_e32 v4, 0xff00, v5 -; FLAT-NEXT: v_and_b32_e32 v10, v10, v15 -; FLAT-NEXT: v_or_b32_e32 v5, v13, v11 -; FLAT-NEXT: v_or_b32_e32 v2, v6, v2 -; FLAT-NEXT: v_or_b32_e32 v3, v0, v3 -; FLAT-NEXT: v_or_b32_sdwa v0, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; FLAT-NEXT: v_or_b32_e32 v6, v8, v10 +; FLAT-NEXT: v_and_b32_e32 v6, v0, v6 +; FLAT-NEXT: v_and_b32_e32 v10, 0xff0000, v10 +; FLAT-NEXT: v_and_b32_e32 v11, 0xff000000, v11 +; FLAT-NEXT: v_and_b32_e32 v13, 0xff000000, v13 +; FLAT-NEXT: v_and_b32_e32 v0, v0, v15 +; FLAT-NEXT: v_and_b32_e32 v7, 0xff00, v7 +; FLAT-NEXT: v_or_b32_e32 v10, v11, v10 +; FLAT-NEXT: v_or_b32_e32 v2, v4, v2 +; FLAT-NEXT: v_or_b32_e32 v4, v13, v6 +; FLAT-NEXT: v_or_b32_e32 v6, v8, v0 +; FLAT-NEXT: v_or_b32_sdwa v0, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; FLAT-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; FLAT-NEXT: v_or_b32_e32 v7, v2, v0 -; FLAT-NEXT: v_or_b32_e32 v5, v5, v12 +; FLAT-NEXT: v_or_b32_e32 v5, v4, v12 ; FLAT-NEXT: v_or_b32_e32 v8, v6, v1 -; FLAT-NEXT: v_and_b32_e32 v0, s2, v3 -; FLAT-NEXT: v_and_b32_e32 v1, s2, v7 -; FLAT-NEXT: v_and_b32_e32 v2, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v3, s0, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s2, v5 -; FLAT-NEXT: v_and_b32_e32 v6, s0, v5 -; FLAT-NEXT: v_and_b32_e32 v5, s2, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s0, v8 +; FLAT-NEXT: v_or_b32_e32 v7, v2, v0 +; FLAT-NEXT: v_or_b32_e32 v3, v10, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v7 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s0, v5 +; FLAT-NEXT: v_and_b32_e32 v6, s1, v5 +; FLAT-NEXT: v_and_b32_e32 v5, s0, v8 +; FLAT-NEXT: v_and_b32_e32 v7, s1, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -960,10 +961,10 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s1, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 ; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 ; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,37 +8,37 @@ define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -72,16 +72,16 @@ ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 @@ -102,25 +102,25 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -160,61 +160,60 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s16, s6 +; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: s_mov_b32 s0, s10 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_add_u32_e32 v0, vcc, s12, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 +; VI-NEXT: s_mov_b32 s1, s11 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -230,77 +229,76 @@ ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_movk_i32 s1, 0xff -; SI-NEXT: s_movk_i32 s2, 0x900 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s12, 0xff00 +; SI-NEXT: s_movk_i32 s13, 0xff +; SI-NEXT: s_movk_i32 s14, 0x900 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: v_and_b32_e32 v2, s0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v4, s12, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, s12, v0 +; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, s13, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s14, v2 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_movk_i32 s10, 0xff00 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_movk_i32 s8, 0xff +; VI-NEXT: s_movk_i32 s9, 0x900 ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_movk_i32 s8, 0xff ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s9, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s10, v1 +; VI-NEXT: v_and_b32_e32 v4, s10, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 +; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s10, v0 -; VI-NEXT: v_add_u16_e32 v0, 9, v0 -; VI-NEXT: v_and_b32_e32 v0, s8, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_and_b32_e32 v3, s8, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s9, v1 -; VI-NEXT: v_add_u16_e32 v0, s9, v0 +; VI-NEXT: v_add_u16_e32 v2, s9, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -317,42 +315,42 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_movk_i32 s18, 0x900 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s8, s4 ; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s4, 0xff00 +; SI-NEXT: s_movk_i32 s5, 0xff +; SI-NEXT: s_movk_i32 s6, 0x900 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 +; SI-NEXT: v_and_b32_e32 v4, s4, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, s4, v0 +; SI-NEXT: v_and_b32_e32 v3, s5, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, s18, v2 +; SI-NEXT: v_and_b32_e32 v1, s5, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -360,39 +358,41 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s14, 0xff00 -; VI-NEXT: s_movk_i32 s12, 0xff -; VI-NEXT: s_movk_i32 s13, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_movk_i32 s6, 0xff00 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_movk_i32 s4, 0xff +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_movk_i32 s5, 0x900 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s14, v1 +; VI-NEXT: v_and_b32_e32 v4, s6, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s12, v1 +; VI-NEXT: v_and_b32_e32 v1, s4, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s14, v0 -; VI-NEXT: v_and_b32_e32 v3, s12, v3 +; VI-NEXT: v_and_b32_e32 v2, s6, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s13, v1 -; VI-NEXT: v_add_u16_e32 v2, s13, v2 +; VI-NEXT: v_add_u16_e32 v1, s5, v1 +; VI-NEXT: v_add_u16_e32 v2, s5, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -407,41 +407,41 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -65,21 +65,22 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32: @@ -131,16 +132,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v1 ; SI-NEXT: v_ffbh_u32_e32 v3, v0 @@ -148,7 +149,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v2i32: @@ -206,16 +208,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v2 @@ -229,7 +231,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v4i32: @@ -299,9 +302,9 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +504,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -509,7 +511,7 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v2 ; SI-NEXT: v_ffbh_u32_e32 v5, v3 @@ -520,7 +522,8 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64: @@ -588,7 +591,6 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -596,8 +598,8 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v4 @@ -607,7 +609,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64_trunc: @@ -615,26 +618,26 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2] +; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v4, v0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 ; VI-NEXT: v_ffbh_u32_e32 v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i64_trunc: @@ -676,19 +679,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -742,19 +746,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -809,23 +814,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -885,23 +891,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -961,18 +968,19 @@ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1030,9 +1038,9 @@ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,19 +1110,20 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -266,20 +266,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v1, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_movk_i32 s12, 0xff -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_movk_i32 s13, 0x900 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 @@ -292,8 +292,7 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6 ; SI-NEXT: v_and_b32_e32 v7, s12, v7 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v6, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 @@ -304,44 +303,44 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s8, 0x900 -; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v9, 9, v5 +; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5 +; VI-NEXT: v_add_u16_e32 v8, 9, v5 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -365,32 +364,33 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v3 +; SI-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:24 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: v_or_b32_e32 v4, v5, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_or_b32_e32 v5, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_or_b32_e32 v4, v3, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 @@ -415,35 +415,32 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[4:5] -; VI-NEXT: flat_load_ubyte v11, v[2:3] -; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 6, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 6, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: flat_load_ubyte v1, v[8:9] +; VI-NEXT: flat_load_ubyte v1, v[12:13] +; VI-NEXT: flat_load_ubyte v10, v[10:11] +; VI-NEXT: flat_load_ubyte v8, v[8:9] ; VI-NEXT: flat_load_ubyte v7, v[6:7] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v11 -; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_sdwa v1, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; VI-NEXT: v_or_b32_e32 v4, v4, v10 ; VI-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; VI-NEXT: v_or_b32_e32 v4, v4, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v4 @@ -906,42 +903,42 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -344,8 +344,8 @@ } ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16: -; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}} diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,23 +7,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_uge_f64: @@ -59,23 +59,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_oge_f64: @@ -111,23 +111,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ugt_f64: @@ -163,23 +163,23 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ogt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_uge_f64: @@ -57,23 +57,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ugt_f64: @@ -109,23 +109,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ule_f64: @@ -161,23 +161,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ult_f64: @@ -213,23 +213,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_oge_f64: @@ -265,23 +265,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ogt_f64: @@ -317,23 +317,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ole_f64: @@ -369,23 +369,23 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_olt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -115,7 +115,7 @@ ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -139,7 +139,7 @@ ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]] ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN-NSZ: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -157,9 +157,9 @@ } ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: -; GCN-SAFE: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} -; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-SAFE-DAG: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1{{$}} +; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN-DAG: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[SIGNBIT]], [[A]] ; GCN-SAFE: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] @@ -312,7 +312,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -332,7 +332,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -352,7 +352,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1863,7 +1863,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1883,7 +1883,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1903,7 +1903,7 @@ ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] -; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ADD]] define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -40,7 +40,7 @@ ; unless isFabsFree returns true ; GCN-LABEL: {{^}}fneg_fabs_free_f16: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 +; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) @@ -50,7 +50,7 @@ } ; GCN-LABEL: {{^}}fneg_fabs_f16: -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x8000 +; GCN: {{s_or_b32 s[0-9]+, s[0-9]+, 0x8000|s_bitset1_b32 s[0-9]+, 15}} define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) %fsub = fsub half -0.0, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -2,11 +2,11 @@ ; Test for a conv2d like sequence of loads. -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} -; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-32{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} ; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:8{{$}} define hidden amdgpu_kernel void @simpleSaddrs(i64 addrspace(1)* %dst_image, i64 addrspace(1)* %src_image ) { diff --git a/llvm/test/CodeGen/AMDGPU/global_smrd.ll b/llvm/test/CodeGen/AMDGPU/global_smrd.ll --- a/llvm/test/CodeGen/AMDGPU/global_smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_smrd.ll @@ -82,9 +82,9 @@ ; CHECK-LABEL: @global_array ; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]] ; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0 -; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 -; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 -; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 +; CHECK-DAG: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0 +; CHECK-DAG: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0 +; CHECK-DAG: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]] @A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -24,25 +24,25 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB0_1 @@ -88,29 +88,29 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_not_b32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 +; GFX9-NEXT: v_not_b32_e32 v6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, 1, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3 -; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v4, s6, v6 ; GFX9-NEXT: v_add_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB1_1 @@ -162,15 +162,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -222,10 +222,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 @@ -275,19 +275,19 @@ ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7 +; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 @@ -326,15 +326,15 @@ ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -90,29 +90,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -244,22 +244,22 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s3, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v2, s1, s0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -318,18 +318,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -341,18 +341,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -364,29 +364,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -446,18 +446,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshr_b32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_lshr_b32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -469,18 +469,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -492,18 +492,18 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -516,16 +516,16 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -634,29 +634,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -716,18 +716,18 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 0xffff +; GFX8-NEXT: s_and_b32 s6, s3, 0xffff ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -739,18 +739,18 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-NODL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -762,18 +762,18 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-DL-NEXT: s_and_b32 s6, s3, 0xffff ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -786,16 +786,16 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -851,14 +851,14 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s3, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 0xffff -; GFX8-NEXT: s_lshr_b32 s1, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -870,14 +870,14 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -889,14 +889,14 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, s3, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, s2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -906,16 +906,16 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s2, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 0xffff -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s1, s4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1024,29 +1024,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1155,29 +1155,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x4 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1306,22 +1306,22 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s7 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s7 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX10-DL-NEXT: s_and_b32 s3, s4, s8 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1450,22 +1450,22 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1594,22 +1594,22 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s7, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1742,23 +1742,23 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -1821,19 +1821,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1845,19 +1845,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1869,19 +1869,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1894,17 +1894,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s3, s2, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s3, s2, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2039,23 +2039,23 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2119,19 +2119,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2143,19 +2143,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2167,19 +2167,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v2, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2192,17 +2192,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i16 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2338,23 +2338,23 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2418,19 +2418,19 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i16 s0, s2 +; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_ashr_i32 s2, s2, 16 -; GFX8-NEXT: s_sext_i32_i16 s1, s3 +; GFX8-NEXT: s_sext_i32_i16 s6, s3 ; GFX8-NEXT: s_ashr_i32 s3, s3, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2442,19 +2442,19 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-NODL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-NODL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2466,19 +2466,19 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2 +; GFX9-DL-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3 +; GFX9-DL-NEXT: s_sext_i32_i16 s6, s3 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2491,17 +2491,17 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 16 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-DL-NEXT: s_ashr_i32 s6, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-DL-NEXT: s_sext_i32_i16 s3, s3 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s1, s0, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s3, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2693,25 +2693,25 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2720,26 +2720,26 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2748,26 +2748,26 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 +; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3 -; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 ; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -46,26 +46,26 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s5, s2 +; GFX8-NEXT: s_sext_i32_i8 s6, s3 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -77,26 +77,26 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s7, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -108,29 +108,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -220,29 +220,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 +; GFX8-NEXT: s_ashr_i32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -251,29 +251,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -551,27 +551,27 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s0, s2 -; GFX8-NEXT: s_sext_i32_i8 s1, s3 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s8 +; GFX8-NEXT: s_sext_i32_i8 s5, s2 +; GFX8-NEXT: s_sext_i32_i8 s6, s3 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX8-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s10 ; GFX8-NEXT: s_ashr_i32 s3, s3, 24 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -583,27 +583,27 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NODL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-NODL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NODL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX9-NODL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -615,27 +615,27 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80008 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s5, v4, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX9-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80008 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s7, v2, v1 +; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x80010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v1, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -648,23 +648,23 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -749,27 +749,27 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX8-NEXT: s_ashr_i32 s5, s3, 24 -; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX8-NEXT: s_ashr_i32 s7, s3, 24 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s3, s3 -; GFX8-NEXT: s_ashr_i32 s0, s2, 24 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX8-NEXT: s_ashr_i32 s5, s2, 24 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -781,27 +781,27 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-NODL-NEXT: s_ashr_i32 s7, s3, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-NODL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-NODL-NEXT: s_ashr_i32 s5, s2, 24 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -813,27 +813,27 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX9-DL-NEXT: s_ashr_i32 s5, s3, 24 -; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80010 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX9-DL-NEXT: s_ashr_i32 s7, s3, 24 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3 -; GFX9-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x80010 +; GFX9-DL-NEXT: s_ashr_i32 s5, s2, 24 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x80010 ; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_i32_i24 v4, s2, v4, v5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v2, v3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -846,24 +846,24 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-DL-NEXT: s_sext_i32_i8 s0, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s1, s3 -; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s2 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s3 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s2 +; GFX10-DL-NEXT: s_sext_i32_i8 s6, s3 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 ; GFX10-DL-NEXT: s_bfe_i32 s5, s3, 0x80010 -; GFX10-DL-NEXT: v_mad_i32_i24 v4, s0, s1, v4 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s3, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1041,42 +1041,43 @@ ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 +; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s3, 0x80000 -; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s1, v2 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s3 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, s4 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s5 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s1, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -111,29 +111,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s2, s3, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -608,24 +608,25 @@ ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s3, s3, 0x80008 +; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -943,29 +944,30 @@ ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: s_and_b32 s5, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX10-DL-NEXT: s_and_b32 s5, s0, s2 +; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1138,29 +1140,29 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s1, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: s_bfe_u32 s2, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1345,30 +1347,30 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s8, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s6, s2, v2 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s3, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1452,29 +1454,29 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_sext_i32_i8 s4, s3 -; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX8-NEXT: s_lshr_b32 s3, s3, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_lshr_b32 s2, s2, 24 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s3, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX8-NEXT: s_lshr_b32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1483,29 +1485,29 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -1514,56 +1516,57 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3 -; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 -; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010 +; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 +; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s3, 0x80008 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s2 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s3 -; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1738,31 +1741,31 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_and_b32 s7, s2, s5 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-DL-NEXT: s_and_b32 s0, s4, s3 -; GFX10-DL-NEXT: s_and_b32 s1, s5, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_bfe_u32 s3, s4, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s2, s5, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v4, s0, s1, v4 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, v2, v3, v4 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -1929,38 +1932,39 @@ ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s3 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 16 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, s3, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s2, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -2083,32 +2087,32 @@ ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2119,59 +2123,60 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16 -; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s2, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s0, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s3 -; GFX10-DL-NEXT: s_lshr_b32 s0, s2, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 +; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s0, s1 -; GFX10-DL-NEXT: s_lshr_b32 s0, s3, 16 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s2, s3 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v5 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s0 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -62,42 +62,42 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -109,42 +109,42 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -321,49 +321,49 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX8-NEXT: s_lshr_b32 s1, s2, 12 -; GFX8-NEXT: s_lshr_b32 s7, s4, 12 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_lshr_b32 s2, s0, 12 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s14 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -372,49 +372,49 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -423,97 +423,98 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -814,49 +815,50 @@ ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s5, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: s_bfe_i32 s9, s5, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: s_bfe_i32 s0, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x40010 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s11, s5, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s2, s5, 0x40018 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: s_ashr_i32 s5, s5, 28 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 +; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s11, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s12, s2, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -987,44 +989,44 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v2, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s7 -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 -; GFX8-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX8-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX8-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 ; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1036,44 +1038,44 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX9-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1085,44 +1087,44 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3 -; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s7 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s9 -; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v4, v2 -; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v4, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: s_bfe_i32 s18, s2, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s19 ; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v2, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1135,36 +1137,36 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_i32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s1, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s7, s8, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: s_ashr_i32 s0, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s1, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_i32_i24 v3, s0, s1, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 +; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1308,56 +1310,56 @@ ; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 20 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 24 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s5, 28 -; GFX8-NEXT: s_lshl_b32 s9, s5, 8 -; GFX8-NEXT: s_lshl_b32 s11, s5, 12 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s1, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s32 -; GFX8-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s30 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s28 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 20 +; GFX8-NEXT: s_lshl_b32 s11, s5, 8 +; GFX8-NEXT: s_lshl_b32 s13, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX8-NEXT: s_lshl_b32 s9, s5, 24 +; GFX8-NEXT: s_lshl_b32 s5, s5, 28 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 4 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 8 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 12 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 16 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 24 +; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX8-NEXT: s_lshl_b32 s5, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s34 +; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s32 +; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s30 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v1, s28 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s26 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s24 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s20 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s26 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s24 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s22 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1369,56 +1371,56 @@ ; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-NEXT: v_mov_b32_e32 v3, s24 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1430,56 +1432,56 @@ ; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX9-DL-NEXT: s_lshl_b32 s1, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[0:1], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s32 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s18, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s28 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v3, v2 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 +; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 +; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s26 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v3, v2 -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v3, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s20 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1492,48 +1494,48 @@ ; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s13, s7, 24 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 20 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 ; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s8, v2 -; GFX10-DL-NEXT: s_lshl_b32 s1, s5, 8 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 +; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[10:11], 60 -; GFX10-DL-NEXT: s_lshl_b32 s9, s7, 8 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s10, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX10-DL-NEXT: s_lshl_b32 s11, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s1, s7, 4 -; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[8:9], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s14, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[10:11], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s12, v2 -; GFX10-DL-NEXT: s_ashr_i64 s[0:1], s[4:5], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[6:7], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s10, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s4, v2 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 +; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1636,68 +1638,68 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 4 -; GFX8-NEXT: s_lshr_b32 s1, s2, 8 -; GFX8-NEXT: s_lshr_b32 s5, s4, 4 -; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s6 -; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 +; GFX8-NEXT: s_lshr_b32 s8, s0, 4 +; GFX8-NEXT: s_lshr_b32 s9, s0, 8 +; GFX8-NEXT: s_lshr_b32 s15, s1, 4 +; GFX8-NEXT: s_lshr_b32 s16, s1, 8 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 -; GFX8-NEXT: s_lshr_b32 s0, s2, 12 -; GFX8-NEXT: s_lshr_b32 s1, s4, 12 +; GFX8-NEXT: s_lshr_b32 s7, s0, 12 +; GFX8-NEXT: s_lshr_b32 s14, s1, 12 ; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 -; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 16 -; GFX8-NEXT: s_lshr_b32 s6, s4, 16 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 20 -; GFX8-NEXT: s_lshr_b32 s1, s4, 20 -; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 -; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 -; GFX8-NEXT: v_lshlrev_b16_e64 v13, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s1 -; GFX8-NEXT: s_lshr_b32 s5, s2, 24 -; GFX8-NEXT: s_lshr_b32 s6, s4, 24 -; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12 -; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s5 -; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s6 -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s1, s4, 28 ; GFX8-NEXT: v_ashrrev_i16_e32 v13, 12, v13 +; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX8-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s13, s1, 16 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, v5, v12 +; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX8-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX8-NEXT: s_lshr_b32 s5, s0, 20 +; GFX8-NEXT: s_lshr_b32 s12, s1, 20 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 ; GFX8-NEXT: v_ashrrev_i16_e32 v14, 12, v14 -; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s0 -; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s1 +; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX8-NEXT: v_lshlrev_b16_e64 v16, 12, s12 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_lshr_b32 s11, s1, 24 +; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8 ; GFX8-NEXT: v_ashrrev_i16_e32 v15, 12, v15 -; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX8-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9 ; GFX8-NEXT: v_ashrrev_i16_e32 v16, 12, v16 +; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10 +; GFX8-NEXT: v_ashrrev_i16_e32 v17, 12, v17 +; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11 ; GFX8-NEXT: v_ashrrev_i16_e32 v18, 12, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v6, v13, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v13, v14, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v15, v17, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, v16, v18, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v7, v14, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v8, v15, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v9, v16, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, v11, v18, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1708,63 +1710,63 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s6 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s4, 15 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s4 +; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_and_b32 s5, s4, 15 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s13 +; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1775,131 +1777,132 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s8 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s6 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s4, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s4 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v1, v5 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v0, v4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40014 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s9, s10 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s13, s2 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s7, s8 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s13 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v7 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s11, s12 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v8, 12, v8 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v9, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v8 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s14, s4 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v9, 12, v9 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v10, 12, s0 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v9 -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v6 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_ashrrev_i16 v10, 12, v10 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v4, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 ; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s2 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v8, 12, s1 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v6, 12, v7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v8 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v6, v7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2284,85 +2287,86 @@ ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 4 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 4 -; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 12 -; GFX10-DL-NEXT: s_lshr_b32 s7, s5, 12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s4 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s15 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5 -; GFX10-DL-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-DL-NEXT: s_lshr_b32 s0, s5, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s16 +; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s0 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v7 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 20 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 20 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v6, v8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s0 -; GFX10-DL-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s1 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v9 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s9 -; GFX10-DL-NEXT: s_lshr_b32 s0, s4, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s5, 24 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 +; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 20 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s11 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s12 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 +; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s1 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v15, 12, v9 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v13 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v8 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v11 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, v19, v10 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v15, v9 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v6 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v10 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v15 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v9, v7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v7 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v11, v12 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_sdwa v5, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v6, v12 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v8 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_or_b32_sdwa v3, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -62,42 +62,42 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -109,42 +109,42 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -304,45 +304,45 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -352,45 +352,45 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -400,87 +400,88 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -606,45 +607,45 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -654,45 +655,45 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -702,87 +703,88 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s4 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10 -; GFX9-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40014 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s8, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s9, s10, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s11, s12, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s13, s14, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -909,47 +911,47 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -960,47 +962,47 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1011,47 +1013,47 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1059,44 +1061,45 @@ ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1208,47 +1211,47 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -1259,47 +1262,47 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1310,47 +1313,47 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1358,44 +1361,45 @@ ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s0, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -1509,44 +1513,44 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s15 -; GFX8-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s14 -; GFX8-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s13 -; GFX8-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s12 -; GFX8-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, s11 -; GFX8-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1558,44 +1562,44 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1607,44 +1611,44 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s19, s4, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s18, s4, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s4, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s2, v2, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s17 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s10, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s9, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s8, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s14 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s7, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s6, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s1, v4, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v3, s0, v4, v3 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s19 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v2, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1657,36 +1661,36 @@ ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s12, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 ; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s14, s4, 0x40018 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s7, s8, v3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s9, s10, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s11, s12, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s5, s6, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s13, s14, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v3, s2, s4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1815,42 +1819,42 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 28 -; GFX8-NEXT: s_lshr_b32 s11, s4, 28 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX8-NEXT: s_lshr_b32 s6, s2, 28 +; GFX8-NEXT: s_lshr_b32 s13, s4, 28 +; GFX8-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX8-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX8-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX8-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX8-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX8-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX8-NEXT: s_and_b32 s4, s4, 15 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s12 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s14 +; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1862,42 +1866,42 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 28 -; GFX9-NEXT: s_lshr_b32 s11, s4, 28 -; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s4, 0x4000c -; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s4, 0x40004 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40018 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_bfe_u32 s18, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s19, s4, 0x40004 ; GFX9-NEXT: s_and_b32 s4, s4, 15 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s2, v2, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mad_u32_u24 v0, s7, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -1909,29 +1913,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s2, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s5 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2032,45 +2036,45 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s4, 0x40018 -; GFX8-NEXT: s_lshr_b32 s14, s4, 28 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x4000c -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s4 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s10 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s12 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 ; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s13, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s14 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2082,51 +2086,51 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 +; GFX9-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s4, 0x40014 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s5, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s14, s15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s4, 15 +; GFX9-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s16, s17 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s6, v0 +; GFX9-NEXT: s_and_b32 s11, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s6, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2137,96 +2141,97 @@ ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s0, v3 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s0, s4, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s1, v4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s4, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s4, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s12, s12, s13 +; GFX9-DL-NEXT: s_bfe_u32 s14, s4, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s4, 0x40014 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s5, v5 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s0, v6 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s5, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s14, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s4, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s4, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s4, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s4, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s6, s7, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s16, s17 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s6, v0 +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s6, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v6 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s0, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s1, s6 -; GFX10-DL-NEXT: s_bfe_u32 s1, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s5, s4, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s7, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s8, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s1, s5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s4 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 @@ -2548,64 +2553,65 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s6, s4, 15 -; GFX10-DL-NEXT: s_and_b32 s8, s5, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s5, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s0, s1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s1, s5, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s0, s1 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s0, s5, 0x40014 -; GFX10-DL-NEXT: s_lshr_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s0 -; GFX10-DL-NEXT: s_bfe_u32 s8, s5, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s5, 0x40018 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s1, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s7, s9 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, s0, s4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v4 -; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX10-DL-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v11, v5, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v11 +; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 +; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v5 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v7 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2696,47 +2702,47 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] +; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s4, 15 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s5 -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 -; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s11 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX8-NEXT: s_lshr_b32 s4, s4, 28 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2747,47 +2753,47 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s4, 15 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -2798,47 +2804,47 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 15 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 -; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 -; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2846,44 +2852,45 @@ ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s0, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s1, s4, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s4, 0x40008 +; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s4, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s8, s0 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s9, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 -; GFX10-DL-NEXT: s_bfe_u32 s0, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s1, s4, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -2973,42 +2980,42 @@ ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s0, s2, 15 -; GFX8-NEXT: s_and_b32 s1, s3, 15 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX8-NEXT: s_and_b32 s5, s2, 15 +; GFX8-NEXT: s_and_b32 s6, s3, 15 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s17, s2, 0x40018 ; GFX8-NEXT: s_lshr_b32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX8-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX8-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX8-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX8-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX8-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX8-NEXT: s_bfe_u32 s8, s3, 0x40004 +; GFX8-NEXT: s_bfe_u32 s10, s3, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s3, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX8-NEXT: s_bfe_u32 s16, s3, 0x40014 +; GFX8-NEXT: s_bfe_u32 s18, s3, 0x40018 ; GFX8-NEXT: s_lshr_b32 s3, s3, 28 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: v_mad_u32_u24 v0, s16, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mad_u32_u24 v2, s18, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -3020,42 +3027,42 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s0, s2, 15 -; GFX9-NEXT: s_and_b32 s1, s3, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40018 +; GFX9-NEXT: s_and_b32 s5, s2, 15 +; GFX9-NEXT: s_and_b32 s6, s3, 15 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40004 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s13, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s17, s2, 0x40018 ; GFX9-NEXT: s_lshr_b32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v2, v3 -; GFX9-NEXT: s_bfe_u32 s6, s3, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s3, 0x40008 -; GFX9-NEXT: s_bfe_u32 s10, s3, 0x4000c -; GFX9-NEXT: s_bfe_u32 s12, s3, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mad_u32_u24 v0, s6, v0, v1 +; GFX9-NEXT: s_bfe_u32 s8, s3, 0x40004 +; GFX9-NEXT: s_bfe_u32 s10, s3, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s3, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s3, 0x40010 +; GFX9-NEXT: s_bfe_u32 s16, s3, 0x40014 +; GFX9-NEXT: s_bfe_u32 s18, s3, 0x40018 ; GFX9-NEXT: s_lshr_b32 s3, s3, 28 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mad_u32_u24 v2, s3, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s13 -; GFX9-NEXT: v_mad_u32_u24 v2, s14, v3, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: v_mad_u32_u24 v2, s16, v3, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, s3, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mad_u32_u24 v0, s10, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: v_mad_u32_u24 v0, s16, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mad_u32_u24 v2, s18, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -3067,29 +3074,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s3, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s3, s2, v2 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -515,19 +515,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0: @@ -538,14 +538,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0: @@ -556,14 +556,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -581,19 +581,19 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: @@ -605,15 +605,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: s_lshr_b32 s1, s4, 16 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, s1, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: @@ -625,15 +625,15 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: s_lshr_b32 s1, s4, 16 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, s1, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -652,18 +652,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -674,14 +674,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -692,14 +692,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -717,37 +717,37 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e70000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1: @@ -758,14 +758,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -786,32 +786,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xfff10000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: @@ -822,14 +822,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -846,19 +846,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0: @@ -869,14 +869,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0: @@ -887,14 +887,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -915,14 +915,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -933,14 +933,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -951,14 +951,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -975,37 +975,37 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x45000000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1: @@ -1016,14 +1016,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1044,32 +1044,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x230000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x230000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: @@ -1080,14 +1080,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1167,20 +1167,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1188,20 +1188,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 4 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1209,20 +1209,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: s_lshl_b32 s2, s4, 4 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: s_lshl_b32 s0, s4, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1239,80 +1239,80 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_mov_b32 s7, 0x12341234 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v2, v[2:3], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x12341234 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0x12341234 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; VI-NEXT: s_mov_b32 s0, 0x12341234 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s6, 0x12341234 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v2, v[2:3] +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_mov_b32 s0, 0x12341234 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, v2, s6, v3 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; CI-NEXT: flat_store_dword v[4:5], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1356,13 +1356,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, s1, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1376,13 +1376,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1423,14 +1423,14 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1447,13 +1447,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1498,13 +1498,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1518,13 +1518,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1565,14 +1565,14 @@ ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1589,13 +1589,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1640,13 +1640,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1660,13 +1660,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1686,53 +1686,52 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0 +; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1 +; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s5, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s6, s4 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s1, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s2, s4, s0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s3, s2, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; VI-NEXT: s_or_b32 s0, s2, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1744,26 +1743,26 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_load_dword v4, v[0:1] ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_load_dword s6, s[4:5], 0x4 -; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 +; CI-NEXT: s_mov_b32 s6, 0xffff ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s5, 0 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s6, 16 -; CI-NEXT: s_and_b32 s3, s6, s4 +; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: s_and_b32 s3, s4, s6 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_or_b32 s1, s3, s2 +; CI-NEXT: s_or_b32 s0, s3, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_bfi_b32 v1, v5, s1, v1 -; CI-NEXT: v_bfi_b32 v0, v4, s1, v0 +; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 +; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1785,24 +1784,24 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 +; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1811,20 +1810,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_and_b32 s2, s4, s6 -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s4, s5, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s2, s5, 4 +; VI-NEXT: s_and_b32 s3, s4, s0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: s_lshl_b32 s2, s3, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1839,20 +1838,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; CI-NEXT: s_mov_b32 s6, 0xffff -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_and_b32 s2, s4, s6 -; CI-NEXT: s_lshl_b32 s3, s4, 16 -; CI-NEXT: s_or_b32 s2, s2, s3 -; CI-NEXT: s_lshl_b32 s4, s5, 4 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_and_b32 s2, s4, s0 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_or_b32 s2, s2, s4 +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_lshl_b32 s3, s5, 4 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -4,9 +4,8 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -2,8 +2,8 @@ ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -21,10 +21,10 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -33,10 +33,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -62,8 +62,8 @@ ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -73,8 +73,8 @@ ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -123,14 +123,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -145,14 +145,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -191,13 +191,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: @@ -208,13 +208,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -251,13 +251,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: @@ -268,13 +268,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,14 +318,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -340,14 +340,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -394,14 +394,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -416,14 +416,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -470,14 +470,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -492,14 +492,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -547,14 +547,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -569,14 +569,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -149,7 +149,7 @@ ; NOLOOP: s_mov_b32 m0, 0{{$}} ; NOLOOP: ds_gws_barrier v{{[0-9]+}} offset:7 gds ; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; NOLOOP-NEXT: load_dword +; NOLOOP: load_dword define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 { call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7) %load = load volatile i32, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -304,7 +304,7 @@ ; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1 ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: v_mov_b32_e32 -; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2 +; GCN: {{global|flat|buffer}}_store_dwordx2 define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) { %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -9,11 +9,11 @@ ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -30,11 +30,11 @@ ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier @@ -51,45 +51,45 @@ ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT2-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; VARIANT2-NEXT: global_store_dword v[1:2], v0, off +; VARIANT2-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off -; VARIANT2-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier -; VARIANT2-NEXT: global_load_dword v0, v[0:1], off +; VARIANT2-NEXT: global_load_dword v0, v[3:4], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off +; VARIANT2-NEXT: global_store_dword v[1:2], v0, off ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT3-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off +; VARIANT3-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off -; VARIANT3-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT3-NEXT: s_barrier -; VARIANT3-NEXT: global_load_dword v0, v[0:1], off +; VARIANT3-NEXT: global_load_dword v0, v[3:4], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off ; VARIANT3-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1537,8 +1537,8 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -5,11 +5,11 @@ ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v2, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16 -; GFX900-NEXT: ds_write_b16 v3, v2 +; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX900-DAG: s_waitcnt lgkmcnt(0) +; GFX900-DAG: v_mov_b32_e32 v1, v2 +; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16 +; GFX900: ds_write_b16 [[ZERO]], v2 ; GFX900-NEXT: s_waitcnt lgkmcnt(1) ; GFX900-NEXT: v_mov_b32_e32 v0, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) @@ -27,14 +27,13 @@ ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: ds_read_u16 v1, v0 -; GFX900-NEXT: ds_read_u16 v0, v0 offset:16 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(1) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0 +; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16 +; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]] +; GFX900-DAG: s_waitcnt lgkmcnt(0) +; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]] +; GFX900: v_lshl_or_b32 [[HI]], [[HI]], 16, [[AND]] ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -518,12 +518,12 @@ ; GFX900-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v3, v0 -; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX900-NEXT: ds_write_b16 v2, v0 +; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -531,12 +531,12 @@ ; GFX906-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v2, 0 ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: ds_write_b16 v3, v0 -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: ds_write_b16 v2, v0 +; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -580,10 +580,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: ds_write_b16 v3, v2 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v4, 0 -; GFX906-NEXT: ds_write_b16 v4, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -594,13 +594,13 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_mov_b32 m0, -1 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 ; GFX803-NEXT: ds_read_u16 v0, v0 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: ds_write_b16 v3, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: ds_write_b16 v2, v1 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -618,12 +618,12 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: ds_write_b16 v3, v5 -; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX900-NEXT: ds_write_b16 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -632,12 +632,12 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: ds_write_b16 v3, v5 -; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX906-NEXT: ds_write_b16 v3, v4 +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -648,11 +648,11 @@ ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -994,13 +994,13 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ushort v0, v[0:1] -; FIXME: and should be removable ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; FIXME: and should be removable entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load half, half* %in @@ -1034,10 +1034,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1116,10 +1116,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1523,11 +1523,11 @@ ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1605,11 +1605,11 @@ ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -112,22 +112,22 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 +; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, 0xfffff9, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -10,10 +10,10 @@ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -72,51 +72,51 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -127,7 +127,7 @@ ; CI-NEXT: v_lshrrev_b32_e32 v3, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -151,13 +151,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: @@ -169,39 +169,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 -; CI-NEXT: v_bfe_u32 v2, v2, s8, 16 +; CI-NEXT: v_bfe_u32 v2, v2, s10, 16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -226,13 +226,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: @@ -244,39 +244,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 -; CI-NEXT: v_bfe_u32 v2, s8, v2, 16 +; CI-NEXT: v_bfe_u32 v2, s10, v2, 16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -300,46 +300,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -347,7 +347,7 @@ ; CI-NEXT: v_bfe_u32 v2, 8, v2, 16 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -368,13 +368,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v2i16: @@ -385,32 +385,32 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -426,60 +426,60 @@ ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1 +; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 @@ -498,7 +498,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -555,22 +555,22 @@ ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s8, 0xff00ff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -223,12 +223,12 @@ ; FIXME: Should be able to use mixlo/mixhi ; GCN-LABEL: {{^}}v_mad_mix_v2f32_clamp_precvt: -; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9: v_cvt_f16_f32_e32 v1, v3 +; GFX9: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp ; GFX9: v_cvt_f16_f32_e32 v0, v0 -; GFX9: v_and_b32_e32 v1, 0xffff, v1 -; GFX9: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9: v_cvt_f16_f32_e32 v1, v3 +; GFX9: v_and_b32_e32 v0, 0xffff, v0 +; GFX9: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9: s_setpc_b64 define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -101,7 +101,7 @@ ; CI: v_bfe_i32 v[[B1:[0-9]+]], v1, 0, 31 ; CI: v_ashr_i64 ; CI: v_bfe_i32 v[[B2:[0-9]+]], v0, 0, 31 -; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v[1:2] +; CI: v_mad_i64_i32 v[0:1], s{{\[[0-9]+:[0-9]+\]}}, v[[B2]], v[[B1]], v{{\[[0-9]+:[0-9]+\]}} define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -17,13 +17,13 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: @@ -39,13 +39,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -74,15 +74,15 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v3, v2 -; VI-NEXT: v_max_i16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v1, v0, v2 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: @@ -98,13 +98,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid @@ -124,35 +124,35 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v8, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v8 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v8 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; VI-NEXT: flat_load_dword v9, v[0:1] -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v4, v[4:5] -; VI-NEXT: flat_load_dword v5, v[2:3] -; VI-NEXT: flat_load_ushort v6, v[6:7] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v8 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_ushort v3, v[8:9] +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[6:7] +; VI-NEXT: v_add_u32_e32 v10, vcc, 4, v4 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_max_i16_e32 v7, v5, v9 -; VI-NEXT: v_max_i16_sdwa v5, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_max_i16_e32 v6, v0, v2 +; VI-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v4, v6, v4 -; VI-NEXT: v_or_b32_e32 v5, v7, v5 -; VI-NEXT: flat_store_dword v[0:1], v5 -; VI-NEXT: flat_store_short v[2:3], v4 +; VI-NEXT: v_max_i16_e32 v1, v1, v3 +; VI-NEXT: v_or_b32_e32 v0, v6, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 +; VI-NEXT: flat_store_short v[10:11], v1 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: @@ -272,13 +272,13 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: @@ -294,13 +294,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -329,13 +329,13 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: @@ -351,13 +351,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -386,13 +386,13 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v3, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v2 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: @@ -408,13 +408,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v3, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -442,15 +442,15 @@ ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v4, v3, v2 -; VI-NEXT: v_max_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v4, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v1, v0, v2 +; VI-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: @@ -466,13 +466,13 @@ ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v2, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_u16 v0, v0, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll --- a/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll +++ b/llvm/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -21,8 +21,8 @@ } ; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: -; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], -; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { main_body: %zcompare.1 = fadd float %zcompare, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -255,10 +255,10 @@ ; GCN-LABEL: {{^}}s_sad_u32_i8_pat2: ; GCN: s_load_dword ; GCN: s_bfe_u32 -; GCN: s_sub_i32 -; GCN: s_and_b32 -; GCN: s_sub_i32 -; GCN: s_lshr_b32 +; GCN-DAG: s_sub_i32 +; GCN-DAG: s_and_b32 +; GCN-DAG: s_sub_i32 +; GCN-DAG: s_lshr_b32 ; GCN: v_add_i32_e32 define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -25,34 +25,34 @@ ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec - ; CHECK: undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK: %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec + ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec + ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec + ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec - ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec - ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec - ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec - ; CHECK: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]] - ; CHECK: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec + ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec + ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec + ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]] + ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: S_SETREG_IMM32_B32 0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir @@ -19,10 +19,10 @@ ; GCN-LABEL: name: handleMove_bundle ; GCN: liveins: $sgpr4_sgpr5 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN: $vcc_hi = IMPLICIT_DEF - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vcc_hi = IMPLICIT_DEF ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3) ; GCN: $m0 = S_MOV_B32 0 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -181,8 +181,8 @@ } ; GCN-LABEL: {{^}}add_select_negliteralk_fabs_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xc4800000 ; GCN-DAG: v_cmp_ne_u32_e64 [[VCC:.*]], s{{[0-9]+}}, 0 @@ -367,9 +367,9 @@ } ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] -; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] @@ -385,9 +385,9 @@ } ; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32: -; GCN: buffer_load_dword [[X:v[0-9]+]] -; GCN: buffer_load_dword [[Y:v[0-9]+]] -; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 +; GCN-DAG: buffer_load_dword [[X:v[0-9]+]] +; GCN-DAG: buffer_load_dword [[Y:v[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983 ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -73,51 +73,51 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -128,7 +128,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -152,13 +152,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: @@ -170,39 +170,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s9, 0xffff -; CI-NEXT: s_lshr_b32 s10, s8, 16 -; CI-NEXT: s_and_b32 s8, s8, s9 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 -; CI-NEXT: v_and_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -227,13 +227,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: @@ -245,17 +245,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_s_v_v2i16: @@ -270,12 +270,12 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_lshr_b32 s9, s8, 16 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v3, s0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshl_b32_e32 v2, s9, v2 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 @@ -301,56 +301,56 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b32 s4, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, s4, v2 +; CI-NEXT: v_and_b32_e32 v3, s0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_lshl_b32_e32 v2, 8, v2 ; CI-NEXT: v_lshl_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_and_b32_e32 v3, s4, v3 +; CI-NEXT: v_and_b32_e32 v3, s0, v3 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -371,13 +371,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v2i16: @@ -388,33 +388,33 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -430,60 +430,60 @@ ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1 +; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -502,7 +502,7 @@ ; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -539,22 +539,22 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_mov_b32 s4, 0xff000000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_mov_b32 s0, 0xff000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s0, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_and_b32_e32 v4, s4, v4 +; VI-NEXT: v_and_b32_e32 v4, s0, v4 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -562,16 +562,16 @@ ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s8, 0xff00 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -581,7 +581,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -31,13 +31,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64: @@ -48,13 +48,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -166,13 +166,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_64_sub_x: @@ -183,13 +183,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -226,13 +226,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_65: @@ -243,13 +243,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffbf, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -286,13 +286,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_65_sub_x: @@ -303,13 +303,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 0x41, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -346,13 +346,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg16: @@ -363,13 +363,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -406,13 +406,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: @@ -423,13 +423,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, -16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -466,13 +466,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg17: @@ -483,13 +483,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 17, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 17, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -526,13 +526,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: @@ -543,13 +543,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 0xffffffef, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -621,13 +621,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64: @@ -638,13 +638,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -756,20 +756,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffc0, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: @@ -780,13 +780,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -823,38 +823,38 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -7, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, -7, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x400007 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x400007 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -891,38 +891,38 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffc0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffc0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x7b0040 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x7b0040 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -963,15 +963,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -7, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -7, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: @@ -982,13 +982,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 7 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1022,19 +1022,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: @@ -1045,13 +1045,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1084,19 +1084,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -1107,13 +1107,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1146,19 +1146,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -1169,13 +1169,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1213,20 +1213,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 +; VI-NEXT: v_mov_b32_e32 v4, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffffe0, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffffe0, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -1237,13 +1237,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1276,19 +1276,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: @@ -1299,13 +1299,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1345,15 +1345,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, 0xffffffe0, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: @@ -1364,13 +1364,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1408,20 +1408,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 +; VI-NEXT: v_mov_b32_e32 v4, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -16, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, -16, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -1432,13 +1432,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1471,19 +1471,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: @@ -1494,13 +1494,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1540,15 +1540,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -16, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: @@ -1559,13 +1559,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1602,20 +1602,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc400, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -1626,13 +1626,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1669,20 +1669,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 4.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 4.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -1693,13 +1693,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1736,20 +1736,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 2.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 2.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -1760,13 +1760,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1803,20 +1803,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc000, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: @@ -1827,13 +1827,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1867,18 +1867,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -1889,13 +1889,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1933,13 +1933,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 32, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef: @@ -1950,13 +1950,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -267,9 +267,9 @@ ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -294,9 +294,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -309,9 +309,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -32,12 +32,12 @@ ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_sub_u16 [[ADD:v[0-9]+]], [[MAX]], -2 op_sel_hi:[1,0] -; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 -; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, -; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 +; VI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI-DAG: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[TWO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll --- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -3,8 +3,8 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s ; GCN-FUNC: {{^}}vccz_workaround: -; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0 -; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, s{{[0-9]+}}, 0{{$}} +; GCN: s_load_dword [[REG:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], +; GCN: v_cmp_neq_f32_e64 {{[^,]*}}, [[REG]], 0{{$}} ; VCCZ-BUG: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -8,10 +8,10 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 ; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; @@ -103,9 +103,9 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4 ; HAWAII-NEXT: ds_write_b32 v0, v1 ; HAWAII-NEXT: s_endpgm ; @@ -117,9 +117,9 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v0, v2 offset:4 ; FIJI-NEXT: ds_write_b32 v0, v1 ; FIJI-NEXT: s_endpgm ; @@ -130,10 +130,10 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void @@ -148,11 +148,11 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: s_and_b32 s0, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s0 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 ; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: s_endpgm ; @@ -164,11 +164,11 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: s_and_b32 s0, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s0 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 ; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: s_endpgm ; @@ -180,9 +180,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -215,10 +215,10 @@ ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: s_mov_b32 m0, -1 ; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -7,7 +7,7 @@ ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -15,8 +15,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -30,7 +30,7 @@ ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -38,8 +38,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -166,42 +166,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -216,42 +216,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3df -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -265,41 +265,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 1 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -313,40 +313,40 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_add_u16_e32 v0, 0xffffffe0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -361,41 +361,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 1.0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -411,7 +411,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -419,8 +419,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -436,7 +436,7 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -444,8 +444,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] @@ -473,7 +473,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -481,14 +481,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v[4:5], off ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 @@ -500,26 +500,26 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v4, v[4:5] -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v2, v4 -; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v2 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -539,7 +539,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -547,8 +547,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -564,7 +564,7 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -572,8 +572,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -603,7 +603,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -611,8 +611,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off @@ -631,7 +631,7 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -639,8 +639,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v2, v[2:3] ; VI-NEXT: flat_load_dword v0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -106,13 +106,13 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[4:5], 0x0 -; VI-NEXT: s_load_dword s3, s[6:7], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_load_dword s0, s[4:5], 0x0 +; VI-NEXT: s_load_dword s1, s[6:7], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_sext_i32_i16 s0, s2 -; VI-NEXT: s_sext_i32_i16 s1, s3 +; VI-NEXT: s_sext_i32_i16 s0, s0 +; VI-NEXT: s_sext_i32_i16 s1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2 ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -116,9 +116,9 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -134,11 +134,11 @@ ; GFX9-LABEL: shuffle_v4f16_357u: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -424,15 +424,15 @@ ; GFX9-LABEL: shuffle_v4f16_3456: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -446,11 +446,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v3, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -464,16 +464,16 @@ ; GFX9-LABEL: shuffle_v4f16_5734: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v2, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_and_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -807,10 +807,10 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll --- a/llvm/test/CodeGen/AMDGPU/wait.ll +++ b/llvm/test/CodeGen/AMDGPU/wait.ll @@ -13,7 +13,7 @@ ; DEFAULT: buffer_load_format_xyzw ; DEFAULT: s_waitcnt vmcnt(0) ; DEFAULT: exp -; DEFAULT-NEXT: exp +; DEFAULT: exp ; DEFAULT-NEXT: s_endpgm define amdgpu_vs void @main(<16 x i8> addrspace(4)* inreg %arg, <16 x i8> addrspace(4)* inreg %arg1, <32 x i8> addrspace(4)* inreg %arg2, <16 x i8> addrspace(4)* inreg %arg3, <16 x i8> addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(4)* inreg %constptr) #0 { main_body: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -516,14 +516,14 @@ } ; GCN-LABEL: {{^}}test_preserve_condition_undef_flag: -; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 -; GFX1032: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 +; GFX1032-DAG: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 +; GFX1032-DAG: v_cmp_ngt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 0 ; GFX1032: v_cmp_nlt_f32_e64 s{{[0-9]+}}, s{{[0-9]+}}, 1.0 ; GFX1032: s_or_b32 [[OR1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}} ; GFX1032: s_or_b32 [[OR2:s[0-9]+]], [[OR1]], s{{[0-9]+}} ; GFX1032: s_and_b32 vcc_lo, exec_lo, [[OR2]] -; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 -; GFX1064: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 +; GFX1064-DAG: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 +; GFX1064-DAG: v_cmp_ngt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 0 ; GFX1064: v_cmp_nlt_f32_e64 s[{{[0-9:]+}}], s{{[0-9]+}}, 1.0 ; GFX1064: s_or_b64 [[OR1:s\[[0-9:]+\]]], s[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GFX1064: s_or_b64 [[OR2:s\[[0-9:]+\]]], [[OR1]], s[{{[0-9:]+}}] diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -272,17 +272,17 @@ ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2 -; VI-NEXT: v_or_b32_e32 v2, 4, v2 +; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0 +; VI-NEXT: v_or_b32_e32 v2, 4, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -155,12 +155,12 @@ ; ; GFX10-LABEL: xor3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0