diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -233,33 +233,11 @@ // Pick best from BotCand and TopCand. LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand); dbgs() << "Bot Cand: "; traceCandidate(BotCand);); - SchedCandidate Cand; - if (TopCand.Reason == BotCand.Reason) { - Cand = BotCand; - GenericSchedulerBase::CandReason TopReason = TopCand.Reason; - TopCand.Reason = NoCand; - GenericScheduler::tryCandidate(Cand, TopCand, nullptr); - if (TopCand.Reason != NoCand) { - Cand.setBest(TopCand); - } else { - TopCand.Reason = TopReason; - } - } else { - if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) { - Cand = BotCand; - } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = TopCand; - } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) { - Cand = BotCand; - } else { - if (BotCand.Reason > TopCand.Reason) { - Cand = TopCand; - } else { - Cand = BotCand; - } - } + SchedCandidate Cand = BotCand; + TopCand.Reason = NoCand; + GenericScheduler::tryCandidate(Cand, TopCand, nullptr); + if (TopCand.Reason != NoCand) { + Cand.setBest(TopCand); } LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand);); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -374,10 +374,10 @@ ; GFX7-LABEL: v_bswap_i16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bswap_i16: @@ -440,10 +440,10 @@ ; GFX7-LABEL: v_bswap_i16_zext_to_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -469,10 +469,10 @@ ; GFX7-LABEL: v_bswap_i16_sext_to_i32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -126,21 +126,21 @@ ; GFX8-LABEL: v_pow_v2f16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_log_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_log_f16_e32 v2, v0 +; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16: @@ -154,11 +154,11 @@ ; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX9-NEXT: v_exp_f16_e32 v1, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX9-NEXT: v_exp_f16_e32 v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) @@ -173,40 +173,40 @@ ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_log_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3 -; GFX6-NEXT: v_exp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3 +; GFX6-NEXT: v_exp_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_pow_v2f16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_log_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX8-NEXT: v_log_f16_e32 v2, v0 +; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: v_exp_f16_e32 v1, v1 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs: @@ -259,22 +259,22 @@ ; GFX8-LABEL: v_pow_v2f16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_log_f16_e32 v0, v0 +; GFX8-NEXT: v_log_f16_e32 v2, v0 +; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_rhs: @@ -336,22 +336,22 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0x80008000 ; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_log_f16_e32 v0, v0 +; GFX8-NEXT: v_log_f16_e32 v2, v0 +; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 -; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: v_exp_f16_e32 v1, v2 +; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3 +; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 ; GFX8-NEXT: v_exp_f16_e32 v0, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_exp_f16_e32 v1, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -233,34 +233,41 @@ ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: v_mov_b32_e32 v16, s7 -; MOVREL-NEXT: v_mov_b32_e32 v8, v0 ; MOVREL-NEXT: v_mov_b32_e32 v14, s5 +; MOVREL-NEXT: v_mov_b32_e32 v12, s3 ; MOVREL-NEXT: v_mov_b32_e32 v13, s4 ; MOVREL-NEXT: v_mov_b32_e32 v15, s6 -; MOVREL-NEXT: v_mov_b32_e32 v12, s3 ; MOVREL-NEXT: v_mov_b32_e32 v11, s2 ; MOVREL-NEXT: v_mov_b32_e32 v10, s1 ; MOVREL-NEXT: v_mov_b32_e32 v9, s0 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v8 -; MOVREL-NEXT: v_mov_b32_e32 v0, v9 -; MOVREL-NEXT: v_mov_b32_e32 v1, v10 -; MOVREL-NEXT: v_mov_b32_e32 v2, v11 -; MOVREL-NEXT: v_mov_b32_e32 v3, v12 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 +; MOVREL-NEXT: v_mov_b32_e32 v1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v2, v10 +; MOVREL-NEXT: v_mov_b32_e32 v3, v11 +; MOVREL-NEXT: v_mov_b32_e32 v4, v12 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v4, v13 -; MOVREL-NEXT: v_mov_b32_e32 v5, v14 -; MOVREL-NEXT: v_mov_b32_e32 v6, v15 -; MOVREL-NEXT: v_mov_b32_e32 v7, v16 -; MOVREL-NEXT: v_movreld_b32_e32 v0, s10 +; MOVREL-NEXT: v_mov_b32_e32 v5, v13 +; MOVREL-NEXT: v_mov_b32_e32 v6, v14 +; MOVREL-NEXT: v_mov_b32_e32 v7, v15 +; MOVREL-NEXT: v_mov_b32_e32 v8, v16 +; MOVREL-NEXT: v_movreld_b32_e32 v1, s10 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB3_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v1 +; MOVREL-NEXT: v_mov_b32_e32 v1, v2 +; MOVREL-NEXT: v_mov_b32_e32 v2, v3 +; MOVREL-NEXT: v_mov_b32_e32 v3, v4 +; MOVREL-NEXT: v_mov_b32_e32 v4, v5 +; MOVREL-NEXT: v_mov_b32_e32 v5, v6 +; MOVREL-NEXT: v_mov_b32_e32 v6, v7 +; MOVREL-NEXT: v_mov_b32_e32 v7, v8 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <8 x float> %vec, float %val, i32 %idx @@ -393,35 +400,41 @@ ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: v_mov_b32_e32 v17, s7 -; MOVREL-NEXT: v_mov_b32_e32 v8, v0 -; MOVREL-NEXT: v_mov_b32_e32 v9, v1 ; MOVREL-NEXT: v_mov_b32_e32 v15, s5 -; MOVREL-NEXT: v_mov_b32_e32 v16, s6 -; MOVREL-NEXT: v_mov_b32_e32 v14, s4 ; MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; MOVREL-NEXT: v_mov_b32_e32 v14, s4 +; MOVREL-NEXT: v_mov_b32_e32 v16, s6 ; MOVREL-NEXT: v_mov_b32_e32 v12, s2 ; MOVREL-NEXT: v_mov_b32_e32 v11, s1 ; MOVREL-NEXT: v_mov_b32_e32 v10, s0 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 -; MOVREL-NEXT: v_mov_b32_e32 v0, v10 -; MOVREL-NEXT: v_mov_b32_e32 v1, v11 -; MOVREL-NEXT: v_mov_b32_e32 v2, v12 -; MOVREL-NEXT: v_mov_b32_e32 v3, v13 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 +; MOVREL-NEXT: v_mov_b32_e32 v2, v10 +; MOVREL-NEXT: v_mov_b32_e32 v3, v11 +; MOVREL-NEXT: v_mov_b32_e32 v4, v12 +; MOVREL-NEXT: v_mov_b32_e32 v5, v13 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v4, v14 -; MOVREL-NEXT: v_mov_b32_e32 v5, v15 -; MOVREL-NEXT: v_mov_b32_e32 v6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v7, v17 -; MOVREL-NEXT: v_movreld_b32_e32 v0, v8 +; MOVREL-NEXT: v_mov_b32_e32 v6, v14 +; MOVREL-NEXT: v_mov_b32_e32 v7, v15 +; MOVREL-NEXT: v_mov_b32_e32 v8, v16 +; MOVREL-NEXT: v_mov_b32_e32 v9, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB6_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v2 +; MOVREL-NEXT: v_mov_b32_e32 v1, v3 +; MOVREL-NEXT: v_mov_b32_e32 v2, v4 +; MOVREL-NEXT: v_mov_b32_e32 v3, v5 +; MOVREL-NEXT: v_mov_b32_e32 v4, v6 +; MOVREL-NEXT: v_mov_b32_e32 v5, v7 +; MOVREL-NEXT: v_mov_b32_e32 v6, v8 +; MOVREL-NEXT: v_mov_b32_e32 v7, v9 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <8 x float> %vec, float %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll @@ -10,9 +10,9 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CHECK-NEXT: s_add_u32 s2, 4, 4 -; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: s_mov_b32 m0, -1 -; CHECK-NEXT: ds_read_b32 v2, v2 +; CHECK-NEXT: ds_read_b32 v2, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_add_u32 s0, s0, 4 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -371,21 +371,21 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: @@ -393,21 +393,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -448,7 +448,6 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -456,7 +455,8 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: @@ -464,7 +464,6 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -472,7 +471,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -678,21 +678,21 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: @@ -700,21 +700,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -755,7 +755,6 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -763,7 +762,8 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: @@ -771,7 +771,6 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -779,7 +778,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -997,20 +997,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1020,20 +1020,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1077,15 +1077,15 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; @@ -1094,15 +1094,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: @@ -1566,20 +1566,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1589,20 +1589,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1646,15 +1646,15 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; @@ -1663,15 +1663,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -373,21 +373,21 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: @@ -395,21 +395,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: @@ -417,21 +417,21 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_inc v4, v[2:3], v4, off glc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v4, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -448,7 +448,6 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -456,7 +455,8 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: @@ -464,7 +464,6 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -472,7 +471,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: @@ -480,7 +480,6 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -488,7 +487,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v4, off glc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -939,20 +939,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -962,20 +962,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -985,20 +985,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[4:5], v[2:3], off glc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm @@ -1017,15 +1017,15 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; @@ -1034,15 +1034,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; @@ -1051,15 +1051,15 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1147,21 +1147,21 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2 -; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: @@ -1169,21 +1169,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: @@ -1191,21 +1191,21 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: flat_atomic_inc v4, v[2:3], v4 glc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id @@ -1222,7 +1222,6 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; CI-NEXT: v_mov_b32_e32 v4, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 @@ -1230,7 +1229,8 @@ ; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1238,7 +1238,6 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -1246,7 +1245,8 @@ ; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: @@ -1254,7 +1254,6 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -1262,7 +1261,8 @@ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id @@ -1420,20 +1420,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s3 -; CI-NEXT: v_mov_b32_e32 v4, s2 -; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm @@ -1443,20 +1443,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1466,20 +1466,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm @@ -1498,15 +1498,15 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 -; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; @@ -1515,15 +1515,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; @@ -1532,15 +1532,15 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, 42 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -20,13 +20,13 @@ ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] -; GFX10-NEXT: s_load_dword s0, s[0:1], 0x2c ; encoding: [0x00,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa] +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] -; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11] ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00] ; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -136,19 +136,19 @@ ; GFX906-LABEL: v_sdot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, v1 +; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX906-NEXT: v_mov_b32_e32 v0, s5 +; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, v1 +; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX908-NEXT: v_mov_b32_e32 v0, s5 +; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: @@ -168,19 +168,19 @@ ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, 8 +; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX906-NEXT: v_mov_b32_e32 v0, s5 +; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, 8 +; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX908-NEXT: v_mov_b32_e32 v0, s5 +; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -136,19 +136,19 @@ ; GFX906-LABEL: v_udot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, v1 +; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX906-NEXT: v_mov_b32_e32 v0, s5 +; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, v1 +; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX908-NEXT: v_mov_b32_e32 v0, s5 +; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b: @@ -168,19 +168,19 @@ ; GFX906-LABEL: v_udot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, 8 +; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX906-NEXT: v_mov_b32_e32 v0, s5 +; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, 8 +; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 +; GFX908-NEXT: v_mov_b32_e32 v0, s5 +; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b_c: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -8,26 +8,26 @@ ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s3 +; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -891,24 +891,24 @@ ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: s_add_u32 s0, s2, 0x3ffc +; GFX6-NEXT: s_add_u32 s4, s2, 0x3ffc ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; GFX6-NEXT: s_addc_u32 s1, s3, 0 -; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_addc_u32 s5, s3, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: s_add_u32 s0, s2, 0x3ffc +; GFX7-NEXT: s_add_u32 s4, s2, 0x3ffc ; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 -; GFX7-NEXT: s_addc_u32 s1, s3, 0 -; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_addc_u32 s5, s3, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095 @@ -1067,32 +1067,34 @@ ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: v_mov_b32_e32 v2, 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v0, v2 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: v_mov_b32_e32 v2, 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset %result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/add3.ll b/llvm/test/CodeGen/AMDGPU/add3.ll --- a/llvm/test/CodeGen/AMDGPU/add3.ll +++ b/llvm/test/CodeGen/AMDGPU/add3.ll @@ -245,12 +245,12 @@ ; ; GFX10-LABEL: add3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=IR %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s @@ -290,11 +290,11 @@ ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0x392fa ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s5, 0x30c30c31 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GCN-NEXT: v_mul_hi_i32 v0, v0, s5 +; GCN-NEXT: s_mov_b32 s4, 0x30c30c31 +; GCN-NEXT: v_mul_hi_i32 v0, v0, s4 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -320,11 +320,11 @@ ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0xa410 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_mov_b32 s5, 0x30c30c31 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_mul_hi_i32 v0, v0, s5 +; GCN-NEXT: s_mov_b32 s4, 0x30c30c31 +; GCN-NEXT: v_mul_hi_i32 v0, v0, s4 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0 ; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -404,7 +404,6 @@ ; IR-LABEL: @select_mul_rhs_const_i32( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000 ; IR-NEXT: ret i32 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %op = mul i32 %select, 1000 ret i32 %op @@ -415,7 +414,6 @@ ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131 ; IR-NEXT: store i16 [[OP]], i16 addrspace(1)* undef ; IR-NEXT: ret void - ; GCN-LABEL: select_add_lhs_const_i16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s0, s[4:5], 0x0 @@ -427,7 +425,6 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: flat_store_short v[0:1], v0 ; GCN-NEXT: s_endpgm -; %select = select i1 %cond, i16 5, i16 8 %op = add i16 %select, 123 store i16 %op, i16 addrspace(1)* undef @@ -445,7 +442,6 @@ ; IR-LABEL: @select_add_trunc_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50 ; IR-NEXT: ret i16 [[OP]] -; %select = select i1 %cond, i32 5, i32 8 %trunc = trunc i32 %select to i16 %op = add i16 %trunc, 42 @@ -473,7 +469,6 @@ ; IR-LABEL: @select_add_zext_select( ; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50 ; IR-NEXT: ret i32 [[OP]] - ; GCN-LABEL: select_add_zext_select: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -105,18 +105,18 @@ ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 @@ -125,7 +125,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB0_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -275,17 +275,17 @@ ; ; GFX1064-LABEL: add_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s1, s0, s1 @@ -298,7 +298,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 @@ -1662,18 +1662,18 @@ ; ; GFX1064-LABEL: sub_i32_constant: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 @@ -1682,7 +1682,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1834,17 +1834,17 @@ ; ; GFX1064-LABEL: sub_i32_uniform: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0 ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: ; implicit-def: $vgpr1 -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s1, s0, s1 @@ -1857,7 +1857,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -339,41 +339,41 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00ff -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s6, 0xcccccccc -; SI-NEXT: s_mov_b32 s8, 0x55555555 -; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s6, 0xff00ff +; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s10, 0x33333333 +; SI-NEXT: s_mov_b32 s11, 0xcccccccc +; SI-NEXT: s_mov_b32 s0, 0x55555555 +; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8 ; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24 ; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 -; SI-NEXT: v_bfi_b32 v2, s0, v0, v2 -; SI-NEXT: v_bfi_b32 v4, s0, v1, v3 -; SI-NEXT: v_and_b32_e32 v1, s1, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v4 -; SI-NEXT: v_and_b32_e32 v3, s2, v2 -; SI-NEXT: v_and_b32_e32 v2, s2, v4 +; SI-NEXT: v_bfi_b32 v2, s6, v0, v2 +; SI-NEXT: v_bfi_b32 v4, s6, v1, v3 +; SI-NEXT: v_and_b32_e32 v1, s8, v2 +; SI-NEXT: v_and_b32_e32 v0, s8, v4 +; SI-NEXT: v_and_b32_e32 v3, s9, v2 +; SI-NEXT: v_and_b32_e32 v2, s9, v4 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v3, s6, v3 -; SI-NEXT: v_and_b32_e32 v2, s6, v2 +; SI-NEXT: v_and_b32_e32 v1, s10, v3 +; SI-NEXT: v_and_b32_e32 v0, s10, v2 +; SI-NEXT: v_and_b32_e32 v3, s11, v3 +; SI-NEXT: v_and_b32_e32 v2, s11, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_or_b32_e32 v3, v3, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s8, v3 -; SI-NEXT: v_and_b32_e32 v0, s8, v2 -; SI-NEXT: v_and_b32_e32 v3, s9, v3 -; SI-NEXT: v_and_b32_e32 v2, s9, v2 +; SI-NEXT: v_and_b32_e32 v1, s0, v3 +; SI-NEXT: v_and_b32_e32 v0, s0, v2 +; SI-NEXT: v_and_b32_e32 v3, s1, v3 +; SI-NEXT: v_and_b32_e32 v2, s1, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 @@ -386,45 +386,45 @@ ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; FLAT-NEXT: s_mov_b32 s2, 0x10203 -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s6, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s6, 0x10203 +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s8, 0x55555555 -; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v2, 0, v0, s2 -; FLAT-NEXT: v_perm_b32 v4, 0, v1, s2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s6, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s6, v4 +; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6 +; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v4 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v4 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_mov_b32 s0, 0x55555555 ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] +; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa ; FLAT-NEXT: v_or_b32_e32 v3, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 -; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_or_b32_e32 v1, v3, v1 ; FLAT-NEXT: v_or_b32_e32 v0, v2, v0 ; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -600,13 +600,13 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s0, 0xff00ff -; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f -; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0 -; SI-NEXT: s_mov_b32 s3, 0x33333333 -; SI-NEXT: s_mov_b32 s8, 0xcccccccc -; SI-NEXT: s_mov_b32 s9, 0x55555555 -; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; SI-NEXT: s_mov_b32 s8, 0xff00ff +; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f +; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0 +; SI-NEXT: s_mov_b32 s11, 0x33333333 +; SI-NEXT: s_mov_b32 s12, 0xcccccccc +; SI-NEXT: s_mov_b32 s13, 0x55555555 +; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8 @@ -617,18 +617,18 @@ ; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8 ; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24 ; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24 -; SI-NEXT: v_bfi_b32 v2, s0, v2, v4 -; SI-NEXT: v_bfi_b32 v4, s0, v3, v5 -; SI-NEXT: v_bfi_b32 v6, s0, v0, v6 -; SI-NEXT: v_bfi_b32 v8, s0, v1, v7 -; SI-NEXT: v_and_b32_e32 v1, s1, v2 -; SI-NEXT: v_and_b32_e32 v0, s1, v4 -; SI-NEXT: v_and_b32_e32 v3, s2, v2 -; SI-NEXT: v_and_b32_e32 v2, s2, v4 -; SI-NEXT: v_and_b32_e32 v5, s1, v6 -; SI-NEXT: v_and_b32_e32 v4, s1, v8 -; SI-NEXT: v_and_b32_e32 v7, s2, v6 -; SI-NEXT: v_and_b32_e32 v6, s2, v8 +; SI-NEXT: v_bfi_b32 v2, s8, v2, v4 +; SI-NEXT: v_bfi_b32 v4, s8, v3, v5 +; SI-NEXT: v_bfi_b32 v6, s8, v0, v6 +; SI-NEXT: v_bfi_b32 v8, s8, v1, v7 +; SI-NEXT: v_and_b32_e32 v1, s9, v2 +; SI-NEXT: v_and_b32_e32 v0, s9, v4 +; SI-NEXT: v_and_b32_e32 v3, s10, v2 +; SI-NEXT: v_and_b32_e32 v2, s10, v4 +; SI-NEXT: v_and_b32_e32 v5, s9, v6 +; SI-NEXT: v_and_b32_e32 v4, s9, v8 +; SI-NEXT: v_and_b32_e32 v7, s10, v6 +; SI-NEXT: v_and_b32_e32 v6, s10, v8 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4 @@ -637,14 +637,14 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v7, v7, v5 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, s3, v3 -; SI-NEXT: v_and_b32_e32 v0, s3, v2 -; SI-NEXT: v_and_b32_e32 v5, s3, v7 -; SI-NEXT: v_and_b32_e32 v4, s3, v6 -; SI-NEXT: v_and_b32_e32 v3, s8, v3 -; SI-NEXT: v_and_b32_e32 v2, s8, v2 -; SI-NEXT: v_and_b32_e32 v7, s8, v7 -; SI-NEXT: v_and_b32_e32 v6, s8, v6 +; SI-NEXT: v_and_b32_e32 v1, s11, v3 +; SI-NEXT: v_and_b32_e32 v0, s11, v2 +; SI-NEXT: v_and_b32_e32 v5, s11, v7 +; SI-NEXT: v_and_b32_e32 v4, s11, v6 +; SI-NEXT: v_and_b32_e32 v3, s12, v3 +; SI-NEXT: v_and_b32_e32 v2, s12, v2 +; SI-NEXT: v_and_b32_e32 v7, s12, v7 +; SI-NEXT: v_and_b32_e32 v6, s12, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2 @@ -653,14 +653,14 @@ ; SI-NEXT: v_or_b32_e32 v2, v2, v0 ; SI-NEXT: v_or_b32_e32 v7, v7, v5 ; SI-NEXT: v_or_b32_e32 v6, v6, v4 -; SI-NEXT: v_and_b32_e32 v1, s9, v3 -; SI-NEXT: v_and_b32_e32 v0, s9, v2 -; SI-NEXT: v_and_b32_e32 v5, s9, v7 -; SI-NEXT: v_and_b32_e32 v4, s9, v6 -; SI-NEXT: v_and_b32_e32 v3, s10, v3 -; SI-NEXT: v_and_b32_e32 v2, s10, v2 -; SI-NEXT: v_and_b32_e32 v7, s10, v7 -; SI-NEXT: v_and_b32_e32 v6, s10, v6 +; SI-NEXT: v_and_b32_e32 v1, s13, v3 +; SI-NEXT: v_and_b32_e32 v0, s13, v2 +; SI-NEXT: v_and_b32_e32 v5, s13, v7 +; SI-NEXT: v_and_b32_e32 v4, s13, v6 +; SI-NEXT: v_and_b32_e32 v3, s14, v3 +; SI-NEXT: v_and_b32_e32 v2, s14, v2 +; SI-NEXT: v_and_b32_e32 v7, s14, v7 +; SI-NEXT: v_and_b32_e32 v6, s14, v6 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 ; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 ; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 @@ -677,33 +677,33 @@ ; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; FLAT-NEXT: s_mov_b32 s2, 0x10203 -; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f -; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s10, 0x10203 +; FLAT-NEXT: s_mov_b32 s2, 0x33333333 +; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: v_mov_b32_e32 v1, s1 ; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s0, 0x33333333 -; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc -; FLAT-NEXT: s_mov_b32 s9, 0x55555555 -; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa +; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f +; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0 +; FLAT-NEXT: s_mov_b32 s8, 0x55555555 +; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa ; FLAT-NEXT: s_mov_b32 s7, 0xf000 ; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; FLAT-NEXT: v_perm_b32 v6, 0, v0, s2 -; FLAT-NEXT: v_perm_b32 v4, 0, v3, s2 -; FLAT-NEXT: v_perm_b32 v2, 0, v2, s2 -; FLAT-NEXT: v_perm_b32 v8, 0, v1, s2 -; FLAT-NEXT: v_and_b32_e32 v1, s3, v2 -; FLAT-NEXT: v_and_b32_e32 v0, s3, v4 -; FLAT-NEXT: v_and_b32_e32 v3, s8, v2 -; FLAT-NEXT: v_and_b32_e32 v2, s8, v4 -; FLAT-NEXT: v_and_b32_e32 v5, s3, v6 -; FLAT-NEXT: v_and_b32_e32 v4, s3, v8 -; FLAT-NEXT: v_and_b32_e32 v7, s8, v6 -; FLAT-NEXT: v_and_b32_e32 v6, s8, v8 +; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10 +; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10 +; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10 +; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10 +; FLAT-NEXT: v_and_b32_e32 v1, s0, v2 +; FLAT-NEXT: v_and_b32_e32 v0, s0, v4 +; FLAT-NEXT: v_and_b32_e32 v3, s1, v2 +; FLAT-NEXT: v_and_b32_e32 v2, s1, v4 +; FLAT-NEXT: v_and_b32_e32 v5, s0, v6 +; FLAT-NEXT: v_and_b32_e32 v4, s0, v8 +; FLAT-NEXT: v_and_b32_e32 v7, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v6, s1, v8 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5] @@ -712,14 +712,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s0, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s0, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s0, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s0, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s1, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s1, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s1, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s1, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s2, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s2, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s2, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s2, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s3, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s3, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s3, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s3, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] @@ -728,14 +728,14 @@ ; FLAT-NEXT: v_or_b32_e32 v2, v2, v0 ; FLAT-NEXT: v_or_b32_e32 v7, v7, v5 ; FLAT-NEXT: v_or_b32_e32 v6, v6, v4 -; FLAT-NEXT: v_and_b32_e32 v1, s9, v3 -; FLAT-NEXT: v_and_b32_e32 v0, s9, v2 -; FLAT-NEXT: v_and_b32_e32 v5, s9, v7 -; FLAT-NEXT: v_and_b32_e32 v4, s9, v6 -; FLAT-NEXT: v_and_b32_e32 v3, s10, v3 -; FLAT-NEXT: v_and_b32_e32 v2, s10, v2 -; FLAT-NEXT: v_and_b32_e32 v7, s10, v7 -; FLAT-NEXT: v_and_b32_e32 v6, s10, v6 +; FLAT-NEXT: v_and_b32_e32 v1, s8, v3 +; FLAT-NEXT: v_and_b32_e32 v0, s8, v2 +; FLAT-NEXT: v_and_b32_e32 v5, s8, v7 +; FLAT-NEXT: v_and_b32_e32 v4, s8, v6 +; FLAT-NEXT: v_and_b32_e32 v3, s9, v3 +; FLAT-NEXT: v_and_b32_e32 v2, s9, v2 +; FLAT-NEXT: v_and_b32_e32 v7, s9, v7 +; FLAT-NEXT: v_and_b32_e32 v6, s9, v6 ; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] ; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -661,11 +661,7 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GFX9-NEXT: v_xor_b32_e32 v2, v4, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1 @@ -674,10 +670,14 @@ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v3, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v5 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v3, v4 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: v_mul_hi_u32 v3, v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 ; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 @@ -734,33 +734,33 @@ ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, v2, v1 -; GFX9-NEXT: v_mul_hi_u32 v5, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, 0, v4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_mul_hi_u32 v4, v4, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v2, v4 -; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v4, v2, v1 +; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_mul_hi_u32 v3, v3, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v4 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v2, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_mul_hi_u32 v2, v2, v0 ; GFX9-NEXT: v_mul_lo_u32 v2, v2, v1 -; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v4, v1 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, v3, v1 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_add_u32_e32 v5, v4, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v3, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] %d = srem i32 %a, %b ret i32 %d diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,37 +8,37 @@ define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -72,16 +72,16 @@ ; VI-LABEL: test_copy_v4i8_x2: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 @@ -102,25 +102,25 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[6:7] +; SI-NEXT: s_mov_b64 s[16:17], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s8, s4 +; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s14, s10 +; SI-NEXT: s_mov_b32 s15, s11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; @@ -160,61 +160,60 @@ define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v4i8_x4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11 -; SI-NEXT: s_mov_b32 s15, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s15 +; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_mov_b32 s18, s14 -; SI-NEXT: s_mov_b32 s19, s15 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s16, s2 -; SI-NEXT: s_mov_b32 s17, s3 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_mov_b32 s3, s15 -; SI-NEXT: s_mov_b32 s12, s6 -; SI-NEXT: s_mov_b32 s13, s7 -; SI-NEXT: s_mov_b32 s6, s14 -; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s16, s6 +; SI-NEXT: s_mov_b32 s17, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_mov_b32 s0, s10 +; SI-NEXT: s_mov_b32 s1, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s18, s2 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 +; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_x4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s11, 0xf000 -; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 +; VI-NEXT: s_mov_b32 s0, s10 +; VI-NEXT: v_mov_b32_e32 v1, s13 +; VI-NEXT: v_add_u32_e32 v0, vcc, s12, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s12, s2 -; VI-NEXT: s_mov_b32 s13, s3 -; VI-NEXT: s_mov_b32 s2, s10 -; VI-NEXT: s_mov_b32 s3, s11 -; VI-NEXT: s_mov_b32 s8, s6 -; VI-NEXT: s_mov_b32 s9, s7 -; VI-NEXT: s_mov_b32 s6, s10 -; VI-NEXT: s_mov_b32 s7, s11 -; VI-NEXT: s_mov_b32 s14, s10 -; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s12, s6 +; VI-NEXT: s_mov_b32 s13, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_mov_b32 s1, s11 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 +; VI-NEXT: s_mov_b32 s14, s2 +; VI-NEXT: s_mov_b32 s15, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -230,76 +229,75 @@ ; SI-LABEL: test_copy_v4i8_extra_use: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s0, 0xff00 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_movk_i32 s1, 0xff +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s12, 0xff00 +; SI-NEXT: s_movk_i32 s13, 0xff +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: v_and_b32_e32 v2, s0, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0 -; SI-NEXT: v_and_b32_e32 v0, s1, v0 -; SI-NEXT: v_and_b32_e32 v3, s0, v1 +; SI-NEXT: v_and_b32_e32 v4, s12, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v1, s1, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v2, s12, v0 +; SI-NEXT: v_and_b32_e32 v3, s13, v3 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, s13, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 +; SI-NEXT: v_or_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v4i8_extra_use: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s8, 0xff00 +; VI-NEXT: s_movk_i32 s10, 0x900 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: s_movk_i32 s8, 0xff00 +; VI-NEXT: s_movk_i32 s9, 0xff ; VI-NEXT: s_mov_b32 s0, s6 ; VI-NEXT: s_mov_b32 s1, s7 -; VI-NEXT: s_movk_i32 s9, 0xff ; VI-NEXT: s_mov_b32 s6, s2 ; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: s_movk_i32 s10, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v3, s8, v1 +; VI-NEXT: v_and_b32_e32 v4, s8, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 +; VI-NEXT: v_add_u16_e32 v3, 9, v0 ; VI-NEXT: v_and_b32_e32 v1, s9, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_or_b32_e32 v1, v4, v1 ; VI-NEXT: v_and_b32_e32 v2, s8, v0 -; VI-NEXT: v_add_u16_e32 v0, 9, v0 -; VI-NEXT: v_and_b32_e32 v0, s9, v0 -; VI-NEXT: v_or_b32_e32 v1, v3, v1 -; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_and_b32_e32 v3, s9, v3 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: v_add_u16_e32 v1, s10, v1 -; VI-NEXT: v_add_u16_e32 v0, s10, v0 +; VI-NEXT: v_add_u16_e32 v2, s10, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -316,33 +314,33 @@ ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s14, 0 -; SI-NEXT: s_mov_b32 s15, s3 +; SI-NEXT: s_mov_b32 s18, 0 +; SI-NEXT: s_mov_b32 s19, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[12:13], s[10:11] +; SI-NEXT: s_mov_b64 s[16:17], s[10:11] ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 -; SI-NEXT: s_mov_b32 s16, 0xff00 -; SI-NEXT: s_movk_i32 s17, 0xff -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 ; SI-NEXT: s_mov_b32 s0, s8 ; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: s_mov_b32 s8, s6 -; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s8, 0xff00 +; SI-NEXT: s_movk_i32 s9, 0xff +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s12, s6 +; SI-NEXT: s_mov_b32 s13, s7 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: v_and_b32_e32 v4, s16, v1 +; SI-NEXT: v_and_b32_e32 v4, s8, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1 -; SI-NEXT: v_and_b32_e32 v2, s16, v0 -; SI-NEXT: v_and_b32_e32 v3, s17, v3 +; SI-NEXT: v_and_b32_e32 v2, s8, v0 +; SI-NEXT: v_and_b32_e32 v3, s9, v3 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v1, s17, v1 +; SI-NEXT: v_and_b32_e32 v1, s9, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -350,7 +348,7 @@ ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -358,39 +356,41 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_movk_i32 s12, 0xff00 -; VI-NEXT: s_movk_i32 s13, 0xff -; VI-NEXT: s_movk_i32 s14, 0x900 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s14, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s8, s4 +; VI-NEXT: s_movk_i32 s4, 0xff00 +; VI-NEXT: s_mov_b32 s9, s5 +; VI-NEXT: s_movk_i32 s5, 0xff +; VI-NEXT: s_movk_i32 s6, 0x900 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s15, s11 +; VI-NEXT: s_mov_b32 s2, s10 +; VI-NEXT: s_mov_b32 s3, s11 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: v_and_b32_e32 v4, s12, v1 +; VI-NEXT: v_and_b32_e32 v4, s4, v1 ; VI-NEXT: v_add_u16_e32 v1, 9, v1 ; VI-NEXT: v_add_u16_e32 v3, 9, v0 -; VI-NEXT: v_and_b32_e32 v1, s13, v1 +; VI-NEXT: v_and_b32_e32 v1, s5, v1 ; VI-NEXT: v_or_b32_e32 v1, v4, v1 -; VI-NEXT: v_and_b32_e32 v2, s12, v0 -; VI-NEXT: v_and_b32_e32 v3, s13, v3 +; VI-NEXT: v_and_b32_e32 v2, s4, v0 +; VI-NEXT: v_and_b32_e32 v3, s5, v3 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: v_add_u16_e32 v1, s14, v1 -; VI-NEXT: v_add_u16_e32 v2, s14, v2 +; VI-NEXT: v_add_u16_e32 v1, s6, v1 +; VI-NEXT: v_add_u16_e32 v2, s6, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -405,41 +405,41 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { ; SI-LABEL: test_copy_v3i8_align4: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_copy_v3i8_align4: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -65,21 +65,22 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32: @@ -131,16 +132,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v2, v1 ; SI-NEXT: v_ffbh_u32_e32 v3, v0 @@ -148,7 +149,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v2i32: @@ -206,16 +208,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_v4i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v2 @@ -229,7 +231,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_v4i32: @@ -299,9 +302,9 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -501,7 +504,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -509,7 +511,7 @@ ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v4, v2 ; SI-NEXT: v_ffbh_u32_e32 v5, v3 @@ -520,7 +522,8 @@ ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64: @@ -588,7 +591,6 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { ; SI-LABEL: v_ctlz_i64_trunc: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 @@ -596,8 +598,8 @@ ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v3 ; SI-NEXT: v_ffbh_u32_e32 v5, v4 @@ -607,7 +609,8 @@ ; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i64_trunc: @@ -615,26 +618,26 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v4, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc -; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2] +; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_ffbh_u32_e32 v4, v0 -; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4 +; VI-NEXT: v_ffbh_u32_e32 v2, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 ; VI-NEXT: v_ffbh_u32_e32 v5, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i64_trunc: @@ -676,19 +679,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -742,19 +746,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -809,23 +814,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -885,23 +891,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v1, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -961,18 +968,19 @@ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i8_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1030,9 +1038,9 @@ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i16_sel_eq_neg1: ; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_mov_b32 s6, s2 ; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,19 +1110,20 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_i7_sel_eq_neg1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -261,19 +261,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind { ; SI-LABEL: load_v4i8_to_v4f32_2_uses: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s6, s10 +; SI-NEXT: s_mov_b32 s7, s11 ; SI-NEXT: s_movk_i32 s12, 0xff ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 @@ -284,7 +284,7 @@ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v0, s12, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5 @@ -297,44 +297,44 @@ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; VI-NEXT: v_mov_b32_e32 v4, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_mov_b32 s11, 0xf000 +; VI-NEXT: s_mov_b32 s10, -1 +; VI-NEXT: s_mov_b32 s6, s10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 -; VI-NEXT: v_mov_b32_e32 v6, s8 +; VI-NEXT: v_mov_b32_e32 v4, 9 +; VI-NEXT: s_mov_b32 s7, s11 +; VI-NEXT: s_movk_i32 s0, 0x900 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5 +; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 -; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5 -; VI-NEXT: v_add_u16_e32 v9, 9, v5 +; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5 +; VI-NEXT: v_add_u16_e32 v8, 9, v5 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7 -; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 +; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 -; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_add_u16_e32 v0, s0, v0 +; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x @@ -358,31 +358,30 @@ ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5 +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6 +; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:1 +; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2 +; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3 +; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v9, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v0 -; SI-NEXT: v_or_b32_e32 v0, v8, v6 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0 +; SI-NEXT: v_or_b32_e32 v0, v8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v2 +; SI-NEXT: v_or_b32_e32 v6, v0, v6 +; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2 ; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 -; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24 +; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:24 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -402,33 +401,35 @@ ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v10, v[2:3] -; VI-NEXT: flat_load_ubyte v11, v[4:5] -; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v6, vcc, 5, v0 +; VI-NEXT: v_add_u32_e32 v6, vcc, 6, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ubyte v2, v[2:3] -; VI-NEXT: flat_load_ubyte v3, v[4:5] -; VI-NEXT: flat_load_ubyte v4, v[6:7] -; VI-NEXT: flat_load_ubyte v5, v[8:9] +; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0 +; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0 +; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ubyte v6, v[6:7] +; VI-NEXT: flat_load_ubyte v7, v[8:9] +; VI-NEXT: flat_load_ubyte v8, v[10:11] +; VI-NEXT: flat_load_ubyte v9, v[12:13] ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v1, v[2:3] +; VI-NEXT: flat_load_ubyte v2, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v10 +; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6 +; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v2 -; VI-NEXT: v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8 +; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5 -; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v4 -; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3 -; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2 +; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2 ; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -880,42 +881,42 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) { ; SI-LABEL: cvt_ubyte0_or_multiuse: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s2 -; SI-NEXT: s_mov_b32 s5, s3 -; SI-NEXT: s_mov_b32 s2, 0 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_mov_b32 s0, s6 +; SI-NEXT: s_mov_b32 s1, s7 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; SI-NEXT: v_add_f32_e32 v0, v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: cvt_ubyte0_or_multiuse: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0 ; VI-NEXT: v_add_f32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm bb: %lid = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -9,11 +9,10 @@ ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -21,11 +20,10 @@ ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen ; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) +; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -91,21 +89,21 @@ ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -7,23 +7,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_uge_f64: @@ -59,23 +59,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_oge_f64: @@ -111,23 +111,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ugt_f64: @@ -163,23 +163,23 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmax_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax_legacy_ogt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -5,23 +5,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_uge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_uge_f64: @@ -57,23 +57,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ugt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ugt_f64: @@ -109,23 +109,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ule_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ule_f64: @@ -161,23 +161,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ult_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ult_f64: @@ -213,23 +213,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_oge_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_oge_f64: @@ -265,23 +265,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ogt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ogt_f64: @@ -317,23 +317,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_ole_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_ole_f64: @@ -369,23 +369,23 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { ; SI-LABEL: test_fmin_legacy_olt_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s10, 0 -; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin_legacy_olt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll --- a/llvm/test/CodeGen/AMDGPU/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/fpow.ll @@ -110,18 +110,18 @@ ; GFX6-LABEL: v_pow_v2f16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_log_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 +; GFX6-NEXT: v_log_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -137,30 +137,30 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_exp_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_log_f32_e32 v2, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow @@ -172,21 +172,21 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_log_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_log_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_log_f32_e32 v4, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v2 +; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v3 ; GFX6-NEXT: v_exp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v4 +; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v4 ; GFX6-NEXT: v_exp_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -201,30 +201,30 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_exp_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX9-NEXT: v_log_f32_e32 v2, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) @@ -266,30 +266,30 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_exp_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; GFX9-NEXT: v_log_f32_e32 v2, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) @@ -336,30 +336,30 @@ ; GFX8-NEXT: v_log_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_exp_f32_e32 v1, v2 ; GFX8-NEXT: v_exp_f32_e32 v0, v0 -; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_exp_f32_e32 v2, v2 ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1 -; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; GFX9-NEXT: v_log_f32_e32 v2, v2 ; GFX9-NEXT: v_log_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_exp_f32_e32 v1, v2 ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -24,25 +24,25 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB0_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 -; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v1 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2 +; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 +; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 +; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB0_1 @@ -88,29 +88,29 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB1_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v3, v2, v1 -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2 -; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3 -; GFX9-NEXT: v_not_b32_e32 v3, v3 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 +; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2 +; GFX9-NEXT: v_not_b32_e32 v6, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, 1, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3 -; GFX9-NEXT: v_add_u32_e32 v4, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 -; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 +; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v4, s6, v6 ; GFX9-NEXT: v_add_u32_e32 v3, s6, v3 -; GFX9-NEXT: v_add_u32_e32 v5, s6, v5 ; GFX9-NEXT: s_add_u32 s6, s6, 1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: s_addc_u32 s7, s7, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: s_add_u32 s4, s4, 4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc ; GFX9-NEXT: s_addc_u32 s5, s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400 ; GFX9-NEXT: global_store_dword v[1:2], v3, off ; GFX9-NEXT: s_cbranch_scc0 BB1_1 @@ -162,15 +162,15 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3 -; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v3 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v3 +; GFX9-NEXT: v_add_u32_e32 v6, -1, v3 +; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: s_add_i32 s6, s6, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GFX9-NEXT: s_add_u32 s4, s4, 4 ; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX9-NEXT: s_addc_u32 s5, s5, 0 @@ -222,10 +222,10 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: BB3_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 @@ -275,19 +275,19 @@ ; GFX9-NEXT: BB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7 +; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: s_and_b64 vcc, exec, vcc ; GFX9-NEXT: global_store_short v[5:6], v2, off ; GFX9-NEXT: s_cbranch_vccz BB4_1 @@ -326,15 +326,15 @@ ; GFX9-NEXT: BB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mov_b32_e32 v7, s5 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1] -; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1 -; GFX9-NEXT: v_trunc_f32_e32 v8, v8 -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8 -; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] +; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7 +; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -102,18 +102,18 @@ ; ; GFX10-DL-LABEL: udot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -246,19 +246,19 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: s_and_b32 s6, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s3, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -377,18 +377,18 @@ ; ; GFX10-DL-LABEL: idot2: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -648,18 +648,18 @@ ; ; GFX10-DL-LABEL: udot2_alt_AddOperands: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -854,14 +854,14 @@ ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_and_b32 s1, s4, 0xffff -; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -873,14 +873,14 @@ ; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -892,14 +892,14 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -1039,18 +1039,18 @@ ; ; GFX10-DL-LABEL: udot2_v4i16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1171,18 +1171,18 @@ ; ; GFX10-DL-LABEL: udot2_v4i16_Hi: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1312,18 +1312,18 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX10-DL-NEXT: s_and_b32 s2, s2, s8 -; GFX10-DL-NEXT: s_and_b32 s4, s4, s8 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s7 +; GFX10-DL-NEXT: s_and_b32 s4, s4, s7 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1457,15 +1457,15 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s3, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s5, s8 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s7 +; GFX10-DL-NEXT: s_and_b32 s5, s5, s7 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 @@ -1602,18 +1602,18 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_and_b32 s7, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s2, s3, s2 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16 +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_and_b32 s7, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1751,20 +1751,20 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s5 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2049,21 +2049,21 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -2349,21 +2349,21 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s3, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: s_and_b32 s2, s2, s4 +; GFX10-DL-NEXT: s_and_b32 s3, s3, s4 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -2621,15 +2621,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -2705,22 +2705,22 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: flat_load_ushort v2, v[2:3] ; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ushort v1, v[2:3] +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0) -; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -2732,23 +2732,23 @@ ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) +; GFX9-NODL-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off @@ -2760,23 +2760,23 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off ; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8 -; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 ; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0 -; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1 +; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -108,29 +108,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -226,21 +226,21 @@ ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s3, s2 +; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010 ; GFX8-NEXT: s_sext_i32_i8 s1, s0 -; GFX8-NEXT: v_mov_b32_e32 v4, s3 ; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010 ; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_ashr_i32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -257,21 +257,21 @@ ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010 +; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 +; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010 ; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 ; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008 -; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off @@ -282,15 +282,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -451,15 +451,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -928,30 +928,30 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sext_i32_i8 s1, s2 -; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010 -; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0 -; GFX8-NEXT: s_sext_i32_i8 s1, s0 -; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0 +; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80010 +; GFX8-NEXT: s_ashr_i32 s4, s1, 24 +; GFX8-NEXT: s_sext_i32_i8 s1, s1 +; GFX8-NEXT: s_ashr_i32 s2, s0, 24 +; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010 +; GFX8-NEXT: s_sext_i32_i8 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010 -; GFX8-NEXT: s_ashr_i32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v6, s3 -; GFX8-NEXT: s_ashr_i32 s0, s0, 24 +; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s0, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, v3, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s3, v6, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1037,16 +1037,17 @@ ; ; GFX10-DL-LABEL: idot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000 @@ -1054,25 +1055,25 @@ ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0 ; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3 ; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000 ; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000 -; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 8, s2 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 +; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5 -; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -111,29 +111,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -278,15 +278,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -448,15 +448,15 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -608,24 +608,25 @@ ; ; GFX10-DL-LABEL: udot2_8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s3, s0, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s1, s2 +; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s1, 0xff +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_and_b32 s3, s2, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s0, s1 +; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -752,14 +753,14 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; @@ -943,16 +944,17 @@ ; ; GFX10-DL-LABEL: udot4_CommutationAccrossMADs: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 @@ -1138,26 +1140,26 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0 +; GFX10-DL-NEXT: s_and_b32 s6, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1345,25 +1347,25 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_and_b32 s8, s3, s2 -; GFX10-DL-NEXT: s_and_b32 s2, s4, s2 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_and_b32 s8, s2, s5 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0 -; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24 -; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s5, v0 +; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0 @@ -1458,21 +1460,21 @@ ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_sext_i32_i8 s3, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX8-NEXT: s_sext_i32_i8 s1, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s3 -; GFX8-NEXT: s_sext_i32_i8 s4, s0 -; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX8-NEXT: s_lshr_b32 s2, s2, 24 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_lshr_b32 s0, s0, 24 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -1489,21 +1491,21 @@ ; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off @@ -1520,21 +1522,21 @@ ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0 -; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010 ; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 ; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -1542,15 +1544,16 @@ ; ; GFX10-DL-LABEL: notdot4_mixedtypes: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008 ; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008 @@ -1742,29 +1745,29 @@ ; GFX10-DL: ; %bb.0: ; %entry ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-DL-NEXT: s_movk_i32 s5, 0xff +; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_and_b32 s7, s4, s3 -; GFX10-DL-NEXT: s_and_b32 s3, s5, s3 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2 -; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_and_b32 s7, s2, s5 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-DL-NEXT: s_and_b32 s5, s3, s5 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1933,38 +1936,39 @@ ; ; GFX10-DL-LABEL: udot4_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 ; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2 -; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 +; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -2084,34 +2088,34 @@ ; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 -; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NODL-NEXT: s_endpgm ; @@ -2120,48 +2124,49 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1 +; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5 -; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot4_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0 ; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -321,48 +321,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX8-NEXT: s_lshr_b32 s1, s0, 12 -; GFX8-NEXT: s_lshr_b32 s7, s2, 12 -; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_lshr_b32 s2, s0, 12 +; GFX8-NEXT: s_lshr_b32 s4, s1, 12 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s1 -; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s1, s1, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -372,48 +372,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-NEXT: s_lshr_b32 s7, s2, 12 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -423,87 +423,88 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 ; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4 -; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6 -; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7 -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1 -; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5 -; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 +; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 +; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 @@ -814,39 +815,40 @@ ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008 +; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 +; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 +; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4 -; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 +; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 @@ -1643,54 +1645,54 @@ ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s5, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s9, s7, 24 -; GFX8-NEXT: s_lshl_b32 s11, s7, 20 -; GFX8-NEXT: s_lshl_b32 s5, s1, 28 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s1, 20 -; GFX8-NEXT: s_lshl_b32 s13, s1, 24 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NEXT: s_lshl_b32 s29, s7, 28 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s21, s7, 8 +; GFX8-NEXT: s_lshl_b32 s23, s7, 12 +; GFX8-NEXT: s_lshl_b32 s17, s1, 28 +; GFX8-NEXT: s_lshl_b32 s25, s7, 16 +; GFX8-NEXT: s_lshl_b32 s27, s7, 24 +; GFX8-NEXT: s_lshl_b32 s19, s7, 4 +; GFX8-NEXT: s_lshl_b32 s7, s7, 20 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 +; GFX8-NEXT: s_lshl_b32 s9, s1, 8 +; GFX8-NEXT: s_lshl_b32 s11, s1, 12 +; GFX8-NEXT: s_lshl_b32 s13, s1, 16 +; GFX8-NEXT: s_lshl_b32 s15, s1, 24 +; GFX8-NEXT: s_lshl_b32 s5, s1, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 20 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s26 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 ; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 16 -; GFX8-NEXT: v_mov_b32_e32 v5, s8 -; GFX8-NEXT: s_lshl_b32 s9, s1, 16 -; GFX8-NEXT: s_lshl_b32 s11, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4 -; GFX8-NEXT: s_lshl_b32 s5, s1, 12 -; GFX8-NEXT: s_lshl_b32 s9, s7, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v6, s16 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s13, s1, 8 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 4 -; GFX8-NEXT: v_mov_b32_e32 v7, s10 -; GFX8-NEXT: s_lshl_b32 s9, s1, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v7, s22 +; GFX8-NEXT: s_ashr_i64 s[32:33], s[18:19], 60 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v9, s24 -; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v9, s32 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2 -; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v5, v2 +; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s30, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1702,17 +1704,21 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s2, 15 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1722,14 +1728,10 @@ ; GFX9-NEXT: s_and_b32 s18, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] @@ -1769,17 +1771,21 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s4, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 ; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 ; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 @@ -1789,14 +1795,10 @@ ; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] @@ -1830,13 +1832,14 @@ ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2273,14 +2276,15 @@ ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -156,29 +156,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -304,46 +304,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -352,46 +352,46 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -400,58 +400,59 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -606,46 +607,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -654,46 +655,46 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s1, s1, 15 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -702,58 +703,59 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 ; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -909,48 +911,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -960,48 +962,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1011,61 +1013,62 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1208,48 +1211,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1259,48 +1262,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -1310,61 +1313,62 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1909,29 +1913,29 @@ ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v2, v3 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry +; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s5, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -2032,46 +2036,46 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s1, s1, 15 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v5, s4 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s9 -; GFX8-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX8-NEXT: s_lshr_b32 s2, s2, 28 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s11 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s12, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s2 ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -2080,53 +2084,53 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-NEXT: s_and_b32 s4, s0, 15 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s14, s15 ; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s12, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s8, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s6, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v5 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s18, s6, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s8, s9 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s17 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s5, v0 +; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s5, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -2135,65 +2139,66 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28 -; GFX9-DL-NEXT: s_and_b32 s4, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s13 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 ; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s14, s15 ; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s12, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s0, v4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s8, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v3 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v6, s1, v6 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v5 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s9 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s16, s17 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s5, v0 +; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s10, s11 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s5, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_sdwa v5, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_add_u32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v4, v4, v3 +; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2 +; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -2548,45 +2553,46 @@ ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s6, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s8, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s9, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s6, s8 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s7, s9 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s5 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 ; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 ; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 -; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s4, s8 +; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 @@ -2598,7 +2604,7 @@ ; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -2696,48 +2702,48 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s1, s2, 15 -; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s9, s0, 15 +; GFX8-NEXT: s_and_b32 s16, s1, 15 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_lshr_b32 s2, s0, 28 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s1, s0, 15 -; GFX8-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX8-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX8-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v7, s6 -; GFX8-NEXT: s_lshr_b32 s11, s2, 28 -; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 -; GFX8-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s2 -; GFX8-NEXT: s_lshr_b32 s0, s0, 28 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_mov_b32_e32 v7, s13 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: v_mov_b32_e32 v9, s11 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2747,48 +2753,48 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, 15 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-NEXT: s_and_b32 s9, s0, 15 +; GFX9-NEXT: s_and_b32 s16, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-NEXT: s_and_b32 s1, s0, 15 -; GFX9-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -2798,61 +2804,62 @@ ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s1, s2, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40008 -; GFX9-DL-NEXT: s_and_b32 s1, s0, 15 -; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5 -; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x40010 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s6 -; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s2 -; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 +; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 +; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off +; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -3078,18 +3085,18 @@ ; ; GFX10-DL-LABEL: udot8_variant1: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 -; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s4, s3, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -8,11 +8,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, 0x3e7, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -21,11 +21,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e7 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -43,11 +43,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -57,13 +57,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -73,12 +73,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_and_b32 s1, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -95,11 +95,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -113,18 +113,18 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_and_b32 s0, s4, 0xffff +; VI-NEXT: s_and_b32 s1, s4, 0xffff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s2 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s2 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -133,18 +133,18 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_and_b32 s0, s4, 0xffff +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s1, s4, 0xffff ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshl_b32 s2, s1, 16 -; CI-NEXT: s_or_b32 s0, s0, s2 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_lshl_b32 s2, s0, 16 +; CI-NEXT: s_or_b32 s1, s1, s2 +; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -162,11 +162,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s2 +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s4, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -176,13 +176,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -192,12 +192,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CI-NEXT: s_or_b32 s0, s1, s0 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -216,16 +216,16 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s1 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -234,17 +234,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s1, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -253,17 +253,17 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s1, s2, 0xffff0000 -; CI-NEXT: s_or_b32 s1, s0, s1 -; CI-NEXT: v_mov_b32_e32 v2, s1 +; CI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; CI-NEXT: s_or_b32 s0, s1, s0 +; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -282,20 +282,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-NEXT: s_lshr_b32 s1, s4, 16 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s1, s2, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s0, s1 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s0 +; GFX9-NEXT: ; use s1 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s1 +; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -304,21 +304,21 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: s_lshr_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s1, s2, 16 -; VI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; VI-NEXT: s_or_b32 s2, s0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s2, s0, 16 +; VI-NEXT: s_and_b32 s0, s0, 0xffff0000 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s0 +; VI-NEXT: ; use s1 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: ;;#ASMSTART -; VI-NEXT: ; use s1 +; VI-NEXT: ; use s2 ; VI-NEXT: ;;#ASMEND ; VI-NEXT: s_endpgm ; @@ -327,21 +327,21 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_lshr_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshr_b32 s1, s2, 16 -; CI-NEXT: s_lshl_b32 s2, s1, 16 -; CI-NEXT: s_or_b32 s2, s0, s2 +; CI-NEXT: s_lshr_b32 s0, s0, 16 +; CI-NEXT: s_lshl_b32 s2, s0, 16 +; CI-NEXT: s_or_b32 s2, s1, s2 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s0 +; CI-NEXT: ; use s1 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: ;;#ASMSTART -; CI-NEXT: ; use s1 +; CI-NEXT: ; use s0 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_endpgm %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr @@ -363,11 +363,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x3e7 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x3e7 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -376,11 +376,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff +; CIVI-NEXT: s_and_b32 s0, s0, 0xffff ; CIVI-NEXT: s_or_b32 s0, s0, 0x3e70000 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -397,11 +397,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -411,13 +411,13 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_load_dword s2, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_load_dword s0, s[2:3], 0x0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: s_lshl_b32 s1, s4, 16 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, s2, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_and_b32 s0, s0, 0xffff +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -427,12 +427,12 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0xc ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s0, s2, 0xffff +; CI-NEXT: s_and_b32 s0, s0, 0xffff ; CI-NEXT: s_or_b32 s0, s0, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -448,11 +448,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s0, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, 0x4500, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -462,11 +462,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff0000 +; CIVI-NEXT: s_and_b32 s0, s0, 0xffff0000 ; CIVI-NEXT: s_or_b32 s0, s0, 0x4500 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -482,11 +482,11 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, 0x4500 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, 0x4500 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -495,11 +495,11 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: s_load_dword s0, s[2:3], 0x0 ; CIVI-NEXT: v_mov_b32_e32 v1, s1 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_and_b32 s0, s2, 0xffff +; CIVI-NEXT: s_and_b32 s0, s0, 0xffff ; CIVI-NEXT: s_or_b32 s0, s0, 0x45000000 ; CIVI-NEXT: v_mov_b32_e32 v2, s0 ; CIVI-NEXT: flat_store_dword v[0:1], v2 @@ -515,19 +515,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, s4, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0: @@ -538,14 +538,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0: @@ -556,14 +556,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e7, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e7, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -581,19 +581,19 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v2, v4, v3, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: @@ -605,15 +605,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: s_lshr_b32 s1, s4, 16 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshr_b32 s0, s4, 16 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, s1, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_reghi: @@ -625,15 +625,15 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: s_lshr_b32 s1, s4, 16 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshr_b32 s0, s4, 16 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, s1, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -652,18 +652,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v3, 53, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -674,14 +674,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -692,14 +692,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -717,37 +717,37 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x3e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x3e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e70000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3e70000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1: @@ -758,14 +758,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x3e70000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x3e70000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -786,32 +786,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, -15, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xfff10000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xfff10000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_1_inlineimm: @@ -822,14 +822,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0xfff10000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0xfff10000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -846,19 +846,19 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0: @@ -869,14 +869,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0: @@ -887,14 +887,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x4500, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x4500, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -915,14 +915,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, 53 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -933,14 +933,14 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_or_b32_e32 v2, 53, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; VI-NEXT: v_or_b32_e32 v0, 53, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -951,14 +951,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; CI-NEXT: v_or_b32_e32 v2, 53, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; CI-NEXT: v_or_b32_e32 v0, 53, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -975,37 +975,37 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_movk_i32 s0, 0x4500 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x45000000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x45000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1: @@ -1016,14 +1016,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x45000000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x45000000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1044,32 +1044,32 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX9-NEXT: v_lshl_or_b32 v2, 35, 16, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1_inlineimm: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x230000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x230000 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_1_inlineimm: @@ -1080,14 +1080,14 @@ ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; CI-NEXT: v_or_b32_e32 v2, 0x230000, v2 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; CI-NEXT: v_or_b32_e32 v0, 0x230000, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1167,20 +1167,20 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: s_lshl_b32 s2, s4, 4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, s0, v3, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1188,20 +1188,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 4 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1209,20 +1209,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; CI-NEXT: v_mov_b32_e32 v3, 0x3e703e7 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: s_lshl_b32 s2, s4, 4 -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_lshl_b32 s0, 0xffff, s2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; CI-NEXT: s_lshl_b32 s0, s4, 4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; CI-NEXT: v_mov_b32_e32 v1, 0x3e703e7 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, s0, v3, v4 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, s0, v1, v0 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1239,80 +1239,78 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, 0xffff -; GFX9-NEXT: s_mov_b32 s7, 0x12341234 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s7, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: s_mov_b32 s0, 0x12341234 +; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0x12341234 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_bfi_b32 v2, v2, s7, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; VI-NEXT: s_mov_b32 s0, 0x12341234 +; VI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 -; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s6, 0x12341234 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v1, s5 -; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: flat_load_dword v4, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; CI-NEXT: flat_load_dword v3, v[0:1] -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2 -; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CI-NEXT: flat_load_dword v2, v[2:3] +; CI-NEXT: flat_load_dword v0, v[0:1] +; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: s_mov_b32 s0, 0x12341234 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v4 -; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2 +; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v2, v2, s6, v3 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_bfi_b32 v0, v1, s0, v0 +; CI-NEXT: flat_store_dword v[4:5], v0 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 @@ -1356,13 +1354,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; VI-NEXT: v_or_b32_e32 v0, s1, v0 +; VI-NEXT: v_or_b32_e32 v0, s0, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1376,13 +1374,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1423,14 +1421,14 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1447,13 +1445,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; CI-NEXT: v_or_b32_e32 v0, s1, v0 +; CI-NEXT: v_or_b32_e32 v0, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1498,13 +1496,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1518,13 +1516,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1565,14 +1563,14 @@ ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_lshl_b32 s2, s4, 16 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_lshl_b32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -1589,13 +1587,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s0, s4, 16 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1640,13 +1638,13 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_and_b32 s1, s4, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s0, s4, 0xffff ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; VI-NEXT: v_or_b32_e32 v1, s1, v1 +; VI-NEXT: v_or_b32_e32 v1, s0, v1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1660,13 +1658,13 @@ ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_and_b32 s1, s4, 0xffff +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_and_b32 s0, s4, 0xffff ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_or_b32_e32 v1, s1, v1 +; CI-NEXT: v_or_b32_e32 v1, s0, v1 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1686,51 +1684,51 @@ ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10 -; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s6, s6 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v5, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v4, s1, v0 +; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1 +; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: s_load_dword s6, s[4:5], 0x10 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: s_mov_b32 s5, 0 -; VI-NEXT: s_and_b32 s1, s6, s4 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s1, 16 -; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_and_b32 s2, s4, s0 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s3, s2, 16 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[4:5] +; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1] +; VI-NEXT: s_or_b32 s0, s2, s3 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_bfi_b32 v1, v5, s0, v1 ; VI-NEXT: v_bfi_b32 v0, v4, s0, v0 @@ -1739,29 +1737,29 @@ ; ; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: s_load_dword s6, s[4:5], 0x4 -; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_load_dword v4, v[0:1] +; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CI-NEXT: s_load_dword s4, s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_mov_b32 s4, 0xffff +; CI-NEXT: s_mov_b32 s6, 0xffff +; CI-NEXT: s_mov_b32 s7, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_mov_b32 s5, 0 -; CI-NEXT: s_lshl_b32 s2, s6, 16 -; CI-NEXT: s_and_b32 s3, s6, s4 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_lshl_b32 s1, s4, 16 +; CI-NEXT: s_and_b32 s3, s4, s6 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_or_b32 s1, s3, s2 +; CI-NEXT: s_or_b32 s0, s3, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4 -; CI-NEXT: v_lshl_b64 v[4:5], s[4:5], v4 +; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: v_bfi_b32 v1, v5, s1, v1 -; CI-NEXT: v_bfi_b32 v0, v4, s1, v0 +; CI-NEXT: v_bfi_b32 v1, v5, s0, v1 +; CI-NEXT: v_bfi_b32 v0, v4, s0, v0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -1783,24 +1781,24 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: s_mov_b32 s7, 0 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX9-NEXT: s_lshl_b32 s2, s5, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 +; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1 +; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -1809,20 +1807,20 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_mov_b32 s6, 0xffff -; VI-NEXT: s_mov_b32 s7, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_and_b32 s2, s4, s6 -; VI-NEXT: s_lshl_b32 s3, s2, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: s_lshl_b32 s4, s5, 4 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; VI-NEXT: s_mov_b32 s0, 0xffff +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: s_mov_b32 s1, 0 +; VI-NEXT: s_lshl_b32 s2, s5, 4 +; VI-NEXT: s_and_b32 s3, s4, s0 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; VI-NEXT: s_lshl_b32 s2, s3, 16 +; VI-NEXT: s_or_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1837,20 +1835,20 @@ ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; CI-NEXT: s_mov_b32 s6, 0xffff -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; CI-NEXT: s_and_b32 s2, s4, s6 -; CI-NEXT: s_lshl_b32 s3, s4, 16 -; CI-NEXT: s_or_b32 s2, s2, s3 -; CI-NEXT: s_lshl_b32 s4, s5, 4 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 -; CI-NEXT: s_lshl_b64 s[0:1], s[6:7], s4 +; CI-NEXT: s_mov_b32 s0, 0xffff +; CI-NEXT: s_and_b32 s2, s4, s0 +; CI-NEXT: s_lshl_b32 s4, s4, 16 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: s_or_b32 s2, s2, s4 +; CI-NEXT: s_mov_b32 s1, 0 +; CI-NEXT: s_lshl_b32 s3, s5, 4 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: v_mov_b32_e32 v5, s2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -21,10 +21,10 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -33,10 +33,10 @@ ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) @@ -62,8 +62,8 @@ ; VI-NEXT: s_load_dword s0, s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -73,8 +73,8 @@ ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) @@ -123,14 +123,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32: @@ -145,14 +145,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -191,13 +191,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: @@ -208,13 +208,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -251,13 +251,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: @@ -268,13 +268,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,14 +318,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo: @@ -340,14 +340,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -394,14 +394,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi: @@ -416,14 +416,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -470,14 +470,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: @@ -492,14 +492,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -547,14 +547,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: @@ -569,14 +569,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -313,16 +313,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -441,16 +441,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -569,16 +569,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -693,16 +693,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -821,16 +821,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -949,16 +949,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1081,16 +1081,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1205,16 +1205,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1333,16 +1333,16 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e] +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e] ; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00] -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -3247,11 +3247,11 @@ ; VERDE-LABEL: image_load_mmo: ; VERDE: ; %bb.0: ; VERDE-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm -; VERDE-NEXT: v_mov_b32_e32 v2, 0 +; VERDE-NEXT: v_mov_b32_e32 v3, 0 ; VERDE-NEXT: s_mov_b32 m0, -1 -; VERDE-NEXT: v_add_i32_e32 v3, vcc, 16, v0 -; VERDE-NEXT: ds_write_b32 v0, v2 -; VERDE-NEXT: ds_write_b32 v3, v2 +; VERDE-NEXT: ds_write_b32 v0, v3 +; VERDE-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; VERDE-NEXT: ds_write_b32 v0, v3 ; VERDE-NEXT: s_waitcnt vmcnt(0) ; VERDE-NEXT: v_mov_b32_e32 v0, v1 ; VERDE-NEXT: s_waitcnt lgkmcnt(0) @@ -3291,9 +3291,9 @@ ; GFX10-LABEL: image_load_mmo: ; GFX10: ; %bb.0: ; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00] -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x03,0x03,0x00] +; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -1783,13 +1783,13 @@ ; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v11, s13 ; encoding: [0x0d,0x02,0x16,0x7e] ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e] ; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08] -; GFX10-NEXT: v_mov_b32_e32 v10, s12 ; encoding: [0x0c,0x02,0x14,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e] +; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] -; GFX10-NEXT: global_store_dword v[10:11], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x01,0x7d,0x00] +; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -9,11 +9,11 @@ ; VARIANT0: ; %bb.0: ; %entry ; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT0-NEXT: s_mov_b32 s6, 0 ; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT0-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT0-NEXT: v_not_b32_e32 v3, v0 ; VARIANT0-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -30,11 +30,11 @@ ; VARIANT1: ; %bb.0: ; %entry ; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb -; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_mov_b32 s7, 0xf000 ; VARIANT1-NEXT: s_mov_b32 s6, 0 ; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT1-NEXT: v_mov_b32_e32 v2, 0 +; VARIANT1-NEXT: v_not_b32_e32 v3, v0 ; VARIANT1-NEXT: s_waitcnt lgkmcnt(0) ; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 ; VARIANT1-NEXT: s_barrier @@ -51,45 +51,45 @@ ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT2-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; VARIANT2-NEXT: global_store_dword v[1:2], v0, off +; VARIANT2-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off -; VARIANT2-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier -; VARIANT2-NEXT: global_load_dword v0, v[0:1], off +; VARIANT2-NEXT: global_load_dword v0, v[3:4], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v[3:4], v0, off +; VARIANT2-NEXT: global_store_dword v[1:2], v0, off ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: v_mov_b32_e32 v4, s3 -; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0 -; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; VARIANT3-NEXT: v_mov_b32_e32 v2, s3 +; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0 +; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] +; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off +; VARIANT3-NEXT: v_mov_b32_e32 v0, s3 ; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2] -; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off -; VARIANT3-NEXT: v_mov_b32_e32 v5, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc +; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc ; VARIANT3-NEXT: s_barrier -; VARIANT3-NEXT: global_load_dword v0, v[0:1], off +; VARIANT3-NEXT: global_load_dword v0, v[3:4], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v[3:4], v0, off +; VARIANT3-NEXT: global_store_dword v[1:2], v0, off ; VARIANT3-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -1537,8 +1537,8 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; SI-LABEL: simplify_bfe_u32_multi_use_arg: ; SI: ; %bb.0: -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -85,20 +85,20 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_movk_i32 s9, 0xfc01 -; SI-NEXT: s_mov_b32 s5, 0xfffff -; SI-NEXT: s_mov_b32 s4, -1 +; SI-NEXT: s_movk_i32 s11, 0xfc01 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: s_mov_b32 s8, -1 ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 -; SI-NEXT: s_brev_b32 s8, -2 +; SI-NEXT: s_brev_b32 s10, -2 ; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: s_mov_b32 s7, 0x80000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s9, v4 -; SI-NEXT: v_lshr_b64 v[4:5], s[4:5], v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, s11, v4 +; SI-NEXT: v_lshr_b64 v[4:5], s[8:9], v10 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, -1, v10 ; SI-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v11, s8, v8, v3 +; SI-NEXT: v_bfi_b32 v11, s10, v8, v3 ; SI-NEXT: v_and_b32_e32 v9, v3, v5 ; SI-NEXT: v_and_b32_e32 v8, v2, v4 ; SI-NEXT: v_lshr_b64 v[6:7], s[6:7], v10 @@ -122,26 +122,26 @@ ; ; CI-LABEL: v_round_f64: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_brev_b32 s2, -2 -; CI-NEXT: v_mov_b32_e32 v5, 0x3ff00000 -; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_brev_b32 s6, -2 +; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_trunc_f64_e32 v[6:7], v[2:3] -; CI-NEXT: v_add_f64 v[8:9], v[2:3], -v[6:7] -; CI-NEXT: v_bfi_b32 v2, s2, v5, v3 -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] -; CI-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc -; CI-NEXT: v_add_f64 v[2:3], v[6:7], v[4:5] -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] +; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] +; CI-NEXT: v_bfi_b32 v2, s6, v8, v3 +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] +; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -519,11 +519,11 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, 0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: ds_write_b16 v3, v0 -; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX900-NEXT: ds_write_b16 v2, v0 +; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -532,11 +532,11 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v3, 0 -; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_mov_b32_e32 v2, 0 +; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: ds_write_b16 v3, v0 -; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 +; GFX906-NEXT: ds_write_b16 v2, v0 +; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -580,10 +580,10 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 +; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX906-NEXT: v_mov_b32_e32 v3, 0 +; GFX906-NEXT: ds_write_b16 v3, v2 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX906-NEXT: v_mov_b32_e32 v4, 0 -; GFX906-NEXT: ds_write_b16 v4, v3 ; GFX906-NEXT: s_waitcnt lgkmcnt(1) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off @@ -596,11 +596,11 @@ ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX803-NEXT: v_mov_b32_e32 v3, 0 -; GFX803-NEXT: ds_write_b16 v3, v1 +; GFX803-NEXT: v_mov_b32_e32 v2, 0 +; GFX803-NEXT: ds_write_b16 v2, v1 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(1) -; GFX803-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -618,12 +618,12 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: ds_read_u16 v0, v0 -; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: ds_write_b16 v2, v0 -; GFX900-NEXT: ds_write_b16 v3, v5 -; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX900-NEXT: ds_write_b16 v3, v4 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX900-NEXT: global_store_dword v[0:1], v0, off ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -632,12 +632,12 @@ ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX906-NEXT: ds_read_u16 v0, v0 -; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) ; GFX906-NEXT: ds_write_b16 v2, v0 -; GFX906-NEXT: ds_write_b16 v3, v5 -; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1 +; GFX906-NEXT: ds_write_b16 v3, v4 +; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 ; GFX906-NEXT: global_store_dword v[0:1], v0, off ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -648,11 +648,11 @@ ; GFX803-NEXT: s_mov_b32 m0, -1 ; GFX803-NEXT: ds_read_u16 v0, v0 ; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX803-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX803-NEXT: s_waitcnt lgkmcnt(0) ; GFX803-NEXT: ds_write_b16 v2, v0 ; GFX803-NEXT: ds_write_b16 v3, v1 -; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1035,10 +1035,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1117,10 +1117,10 @@ ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: flat_load_ubyte v0, v[0:1] -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1524,11 +1524,11 @@ ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] @@ -1606,11 +1606,11 @@ ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 -; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll --- a/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll +++ b/llvm/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -112,22 +112,22 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) { ; GCN-LABEL: muli24_shl64: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] ; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1 +; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0 ; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 -; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -12,10 +12,10 @@ ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ -74,51 +74,51 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -129,7 +129,7 @@ ; CI-NEXT: v_lshr_b32_e32 v3, v4, v5 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -153,13 +153,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: @@ -171,39 +171,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -228,13 +228,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: @@ -246,39 +246,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_s_v_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_lshr_b32 s9, s8, 16 -; CI-NEXT: s_mov_b32 s10, 0xffff -; CI-NEXT: s_and_b32 s8, s8, s10 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_and_b32_e32 v2, s10, v2 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshr_b32_e32 v3, s9, v3 -; CI-NEXT: v_lshr_b32_e32 v2, s8, v2 +; CI-NEXT: v_lshr_b32_e32 v2, s10, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -302,46 +302,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_imm_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8 -; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -349,7 +349,7 @@ ; CI-NEXT: v_lshr_b32_e32 v2, 8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -370,13 +370,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v2i16: @@ -387,32 +387,32 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3 -; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: lshr_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -428,60 +428,60 @@ ; GFX9-LABEL: v_lshr_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1 +; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0 +; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_lshr_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 @@ -500,7 +500,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -557,22 +557,22 @@ ; ; CI-LABEL: lshr_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s8, 0xff00ff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3 ; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -16,14 +16,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v5, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_i16: @@ -38,14 +38,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -73,16 +73,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v3, v5, v2 -; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v2, v0, v1 +; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v2i16: @@ -97,14 +97,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_i16 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid @@ -124,35 +124,35 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6 -; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc -; VI-NEXT: flat_load_ushort v8, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v8, v[6:7] ; VI-NEXT: flat_load_dword v9, v[0:1] ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] -; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4 +; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; VI-NEXT: v_max_i16_e32 v0, v8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_max_i16_e32 v2, v9, v1 ; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: flat_store_short v[4:5], v0 -; VI-NEXT: flat_store_dword v[6:7], v1 +; VI-NEXT: flat_store_short v[6:7], v0 +; VI-NEXT: flat_store_dword v[4:5], v1 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sge_v3i16: @@ -160,8 +160,8 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 @@ -169,19 +169,18 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v7, v[0:1], off -; GFX9-NEXT: global_load_short_d16 v5, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_pk_max_i16 v3, v6, v5 +; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v7, v2 -; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_max_i16 v1, v7, v6 +; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid @@ -272,14 +271,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_i16_e32 v2, v5, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_imax_sgt_i16: @@ -294,14 +293,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v2, v5, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -329,14 +328,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v5, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_uge_i16: @@ -351,14 +350,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -386,14 +385,14 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v5, v[0:1] -; VI-NEXT: flat_load_ushort v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: flat_load_ushort v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v2, v5, v2 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: flat_store_short v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_i16: @@ -408,14 +407,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v5, v[0:1], off -; GFX9-NEXT: global_load_ushort v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_ushort v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v2, v5, v2 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 +; GFX9-NEXT: global_store_short v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -442,16 +441,16 @@ ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[2:3] +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_u16_e32 v3, v5, v2 -; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_max_u16_e32 v2, v0, v1 +; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: flat_store_dword v[4:5], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_umax_ugt_v2i16: @@ -466,14 +465,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v5, v[0:1], off -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v2, v5, v2 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_u16 v0, v0, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir --- a/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-onlydbg-value-empty-region.mir @@ -25,34 +25,34 @@ ; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec ; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec - ; CHECK: undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec - ; CHECK: %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec + ; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec ; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec ; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec + ; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec + ; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF ; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec + ; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec ; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec - ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec ; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec - ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec - ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec - ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec - ; CHECK: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]] - ; CHECK: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec + ; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec + ; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec + ; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec + ; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]] + ; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: S_SETREG_IMM32_B32 0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -39,6 +39,8 @@ ; CHECK: successors: %bb.1(0x80000000) ; CHECK: INLINEASM &"", 1, 851978, def dead %11 ; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3) ; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %15, 851978, def %16 ; CHECK: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec @@ -46,24 +48,22 @@ ; CHECK: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec ; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %21, 851978, def %22 ; CHECK: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec - ; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] - ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec - ; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec - ; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] - ; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: INLINEASM &"", 1, 851978, def dead [[V_MOV_B32_e32_2]], 851978, def dead [[V_MOV_B32_e32_3]], 851977, [[DS_READ_B64_gfx9_]].sub0, 2147483657, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193, [[V_MOV_B32_e32_3]](tied-def 5), 851977, %15, 851977, %16, 851977, [[DS_READ_B32_gfx9_1]], 851977, [[DS_READ_B32_gfx9_]], 851977, [[DS_READ_B32_gfx9_3]], 851977, [[DS_READ_B32_gfx9_2]] + ; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] ; CHECK: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3) ; CHECK: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3) ; CHECK: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3) ; CHECK: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4) + ; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]] ; CHECK: [[V_MUL_LO_U32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec + ; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec ; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec ; CHECK: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_]], [[DEF1]], implicit $exec ; CHECK: [[V_MUL_LO_U32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec ; CHECK: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_1]], [[V_MUL_LO_U32_]], implicit $exec - ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 ; CHECK: [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec ; CHECK: undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_I32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec ; CHECK: undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/scheduler-handle-move-bundle.mir @@ -19,10 +19,10 @@ ; GCN-LABEL: name: handleMove_bundle ; GCN: liveins: $sgpr4_sgpr5 ; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; GCN: $vcc_hi = IMPLICIT_DEF - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4) + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: $vcc_hi = IMPLICIT_DEF ; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3) ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec ; GCN: $m0 = S_MOV_B32 0 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1006,33 +1006,33 @@ ; ; GCN-IR-LABEL: s_test_sdiv24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe -; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s7, s2 -; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31 -; GCN-IR-NEXT: s_sext_i32_i16 s5, s3 -; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 -; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-IR-NEXT: s_mov_b32 s5, s4 -; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 24 -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s10, s8, s4 +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 24 +; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_subb_u32 s11, s9, s4 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 -; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 +; GCN-IR-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-IR-NEXT: s_ashr_i64 s[12:13], s[4:5], 24 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[2:3], s[8:9] +; GCN-IR-NEXT: s_sub_u32 s10, s4, s2 +; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_subb_u32 s11, s5, s2 +; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[12:13] +; GCN-IR-NEXT: s_sub_u32 s8, s4, s6 +; GCN-IR-NEXT: s_subb_u32 s9, s5, s6 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8 ; GCN-IR-NEXT: s_add_i32 s0, s0, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7 +; GCN-IR-NEXT: s_flbit_i32_b32 s1, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s0 ; GCN-IR-NEXT: s_flbit_i32_b32 s0, s10 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s1 -; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 ; GCN-IR-NEXT: s_add_i32 s0, s0, 32 ; GCN-IR-NEXT: s_flbit_i32_b32 s1, s11 ; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc @@ -1042,7 +1042,7 @@ ; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3 ; GCN-IR-NEXT: v_subb_u32_e64 v1, s[0:1], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] @@ -1074,10 +1074,10 @@ ; GCN-IR-NEXT: BB9_4: ; %udiv-preheader ; GCN-IR-NEXT: v_not_b32_e32 v2, v2 ; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4 -; GCN-IR-NEXT: s_add_u32 s10, s6, -1 +; GCN-IR-NEXT: s_add_u32 s10, s8, -1 ; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: s_addc_u32 s11, s7, -1 +; GCN-IR-NEXT: s_addc_u32 s11, s9, -1 ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 @@ -1092,9 +1092,9 @@ ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2 -; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8 +; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8 ; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8 -; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8 +; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc @@ -1112,16 +1112,16 @@ ; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0 ; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1 ; GCN-IR-NEXT: BB9_7: ; %udiv-end -; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] +; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3] ; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i48 %x, 24 %2 = ashr i48 %y, 24 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -281,28 +281,28 @@ define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; GCN-LABEL: shl_i16_computed_amount: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s14, 0 -; GCN-NEXT: s_mov_b32 s15, s7 -; GCN-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-NEXT: s_mov_b32 s15, s3 +; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 ; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0 -; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; EG-LABEL: shl_i16_computed_amount: @@ -402,35 +402,35 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { ; GCN-LABEL: shl_v2i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s14, 0 -; GCN-NEXT: s_mov_b32 s15, s7 -; GCN-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-NEXT: s_mov_b32 s15, s3 +; GCN-NEXT: s_mov_b64 s[12:13], s[6:7] ; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0 ; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s0, 0xffff -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s4, 0xffff +; GCN-NEXT: s_mov_b32 s1, s5 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0 ; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3 -; GCN-NEXT: v_and_b32_e32 v0, s0, v0 +; GCN-NEXT: v_and_b32_e32 v0, s4, v0 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; EG-LABEL: shl_v2i16: @@ -481,17 +481,17 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { ; GCN-LABEL: shl_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] -; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 ; GCN-NEXT: s_mov_b32 s8, 0xffff -; GCN-NEXT: s_mov_b64 s[2:3], s[6:7] +; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -510,7 +510,7 @@ ; GCN-NEXT: v_and_b32_e32 v2, s8, v2 ; GCN-NEXT: v_or_b32_e32 v3, v3, v5 ; GCN-NEXT: v_or_b32_e32 v2, v2, v4 -; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm ; ; EG-LABEL: shl_v4i16: @@ -869,20 +869,20 @@ define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-LABEL: v_shl_32_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; GCN-NEXT: s_ashr_i32 s3, s2, 31 -; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], 3 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b64 s[0:1], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 -; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-NEXT: s_mov_b64 s[4:5], s[10:11] +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_endpgm ; ; EG-LABEL: v_shl_32_i64: diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -73,51 +73,51 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v5, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: flat_load_dword v1, v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 -; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -128,7 +128,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -152,13 +152,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: @@ -170,39 +170,39 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_s_v2i16: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dword s8, s[0:1], 0xd +; CI-NEXT: s_load_dword s0, s[0:1], 0xd +; CI-NEXT: s_mov_b32 s8, 0xffff ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_lshr_b32 s9, s0, 16 +; CI-NEXT: s_and_b32 s10, s0, s8 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 -; CI-NEXT: s_mov_b32 s9, 0xffff -; CI-NEXT: s_lshr_b32 s10, s8, 16 -; CI-NEXT: s_and_b32 s8, s8, s9 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3 -; CI-NEXT: v_and_b32_e32 v2, s9, v2 +; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3 +; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 @@ -227,13 +227,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: @@ -245,17 +245,17 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_lshr_b32 s1, s0, 16 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_s_v_v2i16: @@ -270,12 +270,12 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s0, 0xffff -; CI-NEXT: s_lshr_b32 s1, s8, 16 +; CI-NEXT: s_lshr_b32 s9, s8, 16 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v3, s0, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 +; CI-NEXT: v_lshl_b32_e32 v2, s9, v2 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 @@ -301,46 +301,46 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_imm_v_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 8 +; VI-NEXT: v_mov_b32_e32 v4, 8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8 -; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_imm_v_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -349,7 +349,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -370,13 +370,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v2i16: @@ -387,33 +387,33 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 -; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 +; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: shl_v_imm_v2i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -429,60 +429,60 @@ ; GFX9-LABEL: v_shl_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 -; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 -; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1 +; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0 +; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 -; VI-NEXT: v_or_b32_e32 v0, v3, v0 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v5, v0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; CI-LABEL: v_shl_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 ; CI-NEXT: s_mov_b32 s8, 0xffff -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -501,7 +501,7 @@ ; CI-NEXT: v_and_b32_e32 v2, s8, v2 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -538,22 +538,22 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; VI-NEXT: s_mov_b32 s4, 0xff000000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: s_mov_b32 s0, 0xff000000 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: v_and_b32_e32 v0, s4, v0 +; VI-NEXT: v_and_b32_e32 v0, s0, v0 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_and_b32_e32 v4, s4, v4 +; VI-NEXT: v_and_b32_e32 v4, s0, v4 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -561,16 +561,16 @@ ; ; CI-LABEL: shl_v_imm_v4i16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-NEXT: s_mov_b32 s7, 0xf000 -; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, 0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b64 s[4:5], s[2:3] -; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: s_mov_b64 s[0:1], s[6:7] +; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; CI-NEXT: s_mov_b32 s8, 0xff00 -; CI-NEXT: s_mov_b64 s[2:3], s[6:7] +; CI-NEXT: s_mov_b64 s[6:7], s[2:3] ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 @@ -580,7 +580,7 @@ ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 -; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 +; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; CI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -32,13 +32,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_64: @@ -49,13 +49,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64: @@ -203,13 +203,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 64, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_64_sub_x: @@ -220,13 +220,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_64_sub_x: @@ -279,13 +279,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffbf, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_65: @@ -296,13 +296,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffbf, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffbf, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_65: @@ -355,13 +355,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x41, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_65_sub_x: @@ -372,13 +372,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0x41, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 0x41, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_65_sub_x: @@ -431,13 +431,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg16: @@ -448,13 +448,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_neg16: @@ -507,13 +507,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, -16, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, -16, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg16_sub_x: @@ -524,13 +524,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, -16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, -16, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: @@ -583,13 +583,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, 17, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u32_e32 v0, vcc, 17, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_x_sub_neg17: @@ -600,13 +600,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 17, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_add_u32_e32 v0, 17, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_neg17: @@ -659,13 +659,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xffffffef, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i32_neg17_sub_x: @@ -676,13 +676,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v2, 0xffffffef, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_sub_u32_e32 v0, 0xffffffef, v0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: @@ -781,13 +781,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64: @@ -798,13 +798,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 +; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64: @@ -855,36 +855,36 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: flat_load_ushort v0, v[1:2] +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 +; GFX9-NEXT: global_load_ushort v0, v[1:2], off +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dword v[3:4], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: @@ -1035,20 +1035,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 64 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: @@ -1059,13 +1059,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: @@ -1118,38 +1118,38 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 64 +; VI-NEXT: v_mov_b32_e32 v4, 64 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -7, v4 -; VI-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, -7, v0 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x400007 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x400007 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: @@ -1202,38 +1202,38 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffff85 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xffffff85 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v3, 64, v4 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v0, 64, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x7b0040 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: s_mov_b32 s0, 0x7b0040 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, s4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: @@ -1290,15 +1290,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -7, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -7, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: @@ -1309,13 +1309,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 7 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 7 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: @@ -1365,19 +1365,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: @@ -1388,13 +1388,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: @@ -1443,19 +1443,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -1466,13 +1466,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, -4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, -4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: @@ -1521,19 +1521,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffbc00 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0xffffbc00 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -1544,13 +1544,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v2, v3, 4.0 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_i16 v0, v0, 4.0 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: @@ -1604,20 +1604,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_subrev_u16_e32 v3, 32, v4 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -1628,13 +1628,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: @@ -1683,19 +1683,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: @@ -1706,13 +1706,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: @@ -1768,15 +1768,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_subrev_u16_e32 v3, 32, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: @@ -1787,13 +1787,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: @@ -1847,20 +1847,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 +; VI-NEXT: v_mov_b32_e32 v4, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, -16, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, -16, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -1871,13 +1871,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: @@ -1926,19 +1926,19 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, -16 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, -16 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: @@ -1949,13 +1949,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: @@ -2011,15 +2011,15 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 -; VI-NEXT: v_add_u16_e32 v3, -16, v3 -; VI-NEXT: v_or_b32_e32 v2, v3, v2 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; VI-NEXT: v_add_u16_e32 v0, -16, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg16_0: @@ -2030,13 +2030,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 16 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: @@ -2089,20 +2089,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc400 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc400, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc400, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -2113,13 +2113,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone: @@ -2172,20 +2172,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 4.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 4.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -2196,13 +2196,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -1.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -1.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone: @@ -2255,20 +2255,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 2.0, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 2.0, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -2279,13 +2279,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, -2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, -2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: @@ -2338,20 +2338,20 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 0xffffc000 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0xffffc000, v4 -; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v2, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_add_u16_e32 v1, 0xffffc000, v0 +; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: @@ -2362,13 +2362,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 2.0 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: @@ -2418,18 +2418,18 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-NEXT: v_mov_b32_e32 v3, 32 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 32 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -2440,13 +2440,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: @@ -2500,13 +2500,13 @@ ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_subrev_u16_e32 v2, 32, v3 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 +; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef: @@ -2517,13 +2517,13 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef: diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -294,9 +294,9 @@ ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s2 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -309,9 +309,9 @@ ; VI-NEXT: s_mov_b32 s7, 0xf000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0 ; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1185,25 +1185,25 @@ ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb -; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xc +; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc ; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_sext_i32_i16 s5, s3 -; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31 -; GCN-IR-NEXT: s_sext_i32_i16 s3, s6 +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24 +; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 ; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31 +; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24 +; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31 ; GCN-IR-NEXT: s_mov_b32 s3, s2 -; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 24 -; GCN-IR-NEXT: s_mov_b32 s11, s10 +; GCN-IR-NEXT: s_mov_b32 s5, s4 ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s6, s6, s2 ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 -; GCN-IR-NEXT: s_sub_u32 s8, s4, s10 -; GCN-IR-NEXT: s_subb_u32 s9, s5, s10 +; GCN-IR-NEXT: s_sub_u32 s8, s8, s4 +; GCN-IR-NEXT: s_subb_u32 s9, s9, s4 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8 ; GCN-IR-NEXT: s_add_i32 s0, s0, 32 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -8,10 +8,10 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 ; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; @@ -53,24 +53,24 @@ ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: s_and_b32 s3, s1, 0xffff ; FIJI-NEXT: s_add_u32 s0, s4, 14 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 ; FIJI-NEXT: s_addc_u32 s1, s5, 0 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 ; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; FIJI-NEXT: ds_write_b16 v2, v3 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v3, s2 +; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 ; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b16 v2, v1 offset:4 ; FIJI-NEXT: ds_write_b8 v2, v0 offset:6 ; FIJI-NEXT: ds_write_b32 v2, v3 ; FIJI-NEXT: s_endpgm @@ -109,9 +109,9 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4 ; HAWAII-NEXT: ds_write_b32 v0, v1 ; HAWAII-NEXT: s_endpgm ; @@ -123,9 +123,9 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: ds_write_b16 v0, v2 offset:4 ; FIJI-NEXT: ds_write_b32 v0, v1 ; FIJI-NEXT: s_endpgm ; @@ -136,10 +136,10 @@ ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: ds_write_b16 v0, v2 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: ds_write_b32 v0, v2 ; GFX9-NEXT: s_endpgm store i48 %arg, i48 addrspace(3)* %ptr, align 8 ret void @@ -154,11 +154,11 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: s_and_b32 s0, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s0 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 ; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: s_endpgm ; @@ -170,11 +170,11 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; FIJI-NEXT: s_and_b32 s0, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s0 -; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 ; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: s_endpgm ; @@ -186,9 +186,9 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: ds_write_b8 v2, v3 offset:8 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] ; GFX9-NEXT: s_endpgm @@ -222,9 +222,9 @@ ; CIVI: ; %bb.0: ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 ; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -7,7 +7,7 @@ ; GFX9-LABEL: v_test_sub_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -15,8 +15,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -30,7 +30,7 @@ ; VI-LABEL: v_test_sub_v2i16: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -38,8 +38,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -166,42 +166,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -216,42 +216,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_neg_constant: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_neg_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0x3df -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -265,41 +265,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_neg1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0] -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v2, 1 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u16_e32 v1, 1, v0 ; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -313,40 +313,40 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; VI-NEXT: v_subrev_u16_e32 v0, 32, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -361,41 +361,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { ; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_mov_b32 s8, 1.0 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s4, 1.0 +; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_test_sub_v2i16_inline_fp_split: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -411,7 +411,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -419,8 +419,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -436,7 +436,7 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -444,8 +444,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] @@ -473,7 +473,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -481,14 +481,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v[4:5], off ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 @@ -500,26 +500,26 @@ ; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; VI-NEXT: flat_load_dword v4, v[0:1] ; VI-NEXT: flat_load_dword v2, v[2:3] -; VI-NEXT: flat_load_dword v4, v[4:5] -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_sub_u16_e32 v0, v2, v4 -; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_sub_u16_e32 v0, v4, v2 +; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -539,7 +539,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -547,8 +547,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -564,7 +564,7 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -572,8 +572,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] @@ -603,7 +603,7 @@ ; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 @@ -611,8 +611,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: global_load_dword v1, v[2:3], off @@ -631,7 +631,7 @@ ; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 @@ -639,8 +639,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: flat_load_dword v1, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -704,41 +704,41 @@ ; GCN-NEXT: s_load_dword s3, s[0:1], 0xe ; GCN-NEXT: s_mov_b32 s5, 0xff000000 ; GCN-NEXT: s_mov_b32 s4, 0xffff -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s2, s2, s5 ; GCN-NEXT: s_and_b32 s3, s3, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_load_dword s6, s[0:1], 0xb ; GCN-NEXT: s_load_dword s7, s[0:1], 0xc ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0 -; GCN-NEXT: v_rcp_f32_e32 v0, v2 +; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s7, s7, s4 ; GCN-NEXT: s_and_b32 s6, s6, s5 ; GCN-NEXT: s_sub_u32 s8, 0, s2 -; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 +; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_subb_u32 s9, 0, s3 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mul_lo_u32 v3, s8, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s8, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v5, s9, v1 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s8, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -749,18 +749,18 @@ ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 +; GCN-NEXT: v_add_i32_e64 v1, s[2:3], v1, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[2:3] ; GCN-NEXT: v_mul_lo_u32 v5, s8, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s8, v0 -; GCN-NEXT: v_mul_lo_u32 v7, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v6, s8, v1 +; GCN-NEXT: v_mul_lo_u32 v7, s9, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_mul_lo_u32 v6, s8, v0 +; GCN-NEXT: v_mul_lo_u32 v6, s8, v1 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_mul_lo_u32 v11, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v13, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v12, v0, v6 +; GCN-NEXT: v_mul_lo_u32 v11, v1, v5 +; GCN-NEXT: v_mul_hi_u32 v13, v1, v5 +; GCN-NEXT: v_mul_hi_u32 v12, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v3, v5 @@ -774,50 +774,50 @@ ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[2:3] -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_hi_u32 v5, v3, v0 +; GCN-NEXT: v_mul_hi_u32 v5, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v2 -; GCN-NEXT: v_mul_hi_u32 v0, 0, v0 +; GCN-NEXT: v_mul_hi_u32 v1, 0, v1 ; GCN-NEXT: v_mul_hi_u32 v2, 0, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v1, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v0 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v1 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v1 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v1 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0 ; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 ; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v1 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v1 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v1, -1, v1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc +; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v5, s[0:1] +; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -827,15 +827,15 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb ; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc -; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xd +; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd ; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_mov_b32 s4, 0xffff -; GCN-IR-NEXT: s_mov_b32 s6, 0xff000000 +; GCN-IR-NEXT: s_mov_b32 s7, 0xff000000 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_and_b32 s3, s3, s4 -; GCN-IR-NEXT: s_and_b32 s2, s2, s6 +; GCN-IR-NEXT: s_and_b32 s2, s2, s7 ; GCN-IR-NEXT: s_and_b32 s5, s5, s4 -; GCN-IR-NEXT: s_and_b32 s4, s7, s6 +; GCN-IR-NEXT: s_and_b32 s4, s6, s7 ; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[2:3], 24 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -108,12 +108,12 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v2, v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 @@ -128,11 +128,11 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -415,12 +415,11 @@ ; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -809,14 +808,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v5, s3 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm %ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -272,17 +272,17 @@ ; VI-LABEL: no_widen_i16_constant_divergent_load: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 -; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; VI-NEXT: flat_load_ushort v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2 -; VI-NEXT: v_or_b32_e32 v2, 4, v2 +; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0 +; VI-NEXT: v_or_b32_e32 v2, 4, v0 +; VI-NEXT: v_mov_b32_e32 v0, 0 +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -155,12 +155,12 @@ ; ; GFX10-LABEL: xor3_uniform_vgpr: ; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0 ; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0 -; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0 -; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4 +; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %a1 = fadd float %a, 1.0 %b2 = fadd float %b, 2.0