diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1763,6 +1763,27 @@ } } + // Narrow shift to lower half - similar to ShrinkDemandedOp. + // (shl i64:x, K) -> (i64 zero_extend (shl (i32 (trunc i64:x)), K)) + if ((BitWidth % 2) == 0 && !VT.isVector() && ShAmt < (BitWidth / 2) && + (DemandedBits.countLeadingZeros() >= (BitWidth / 2) || + TLO.DAG.MaskedValueIsZero( + Op0, APInt::getHighBitsSet(BitWidth, ShAmt + (BitWidth / 2))))) { + EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2); + if (isNarrowingProfitable(VT, HalfVT) && + isTypeDesirableForOp(ISD::SHL, HalfVT) && + isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && + (!TLO.LegalOperations() || isOperationLegal(ISD::SHL, VT))) { + SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0); + SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant( + ShAmt, HalfVT, dl, TLO.LegalTypes()); + SDValue NewShift = + TLO.DAG.getNode(ISD::SHL, dl, HalfVT, NewOp, NewShiftAmt); + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift)); + } + } + APInt InDemandedMask = DemandedBits.lshr(ShAmt); if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -4669,49 +4669,49 @@ ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_and_b32 s4, s6, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: s_bfe_u32 s4, s8, 0xf000f ; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 +; GFX6-NEXT: v_mad_f32 v6, -v1, v5, v6 +; GFX6-NEXT: v_cvt_u32_f32_e32 v7, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v0, v4 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v1 ; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v7, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: udiv_v3i15: @@ -4724,48 +4724,48 @@ ; GFX9-NEXT: s_and_b32 s1, s2, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: s_bfe_u32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s2 ; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_mad_f32 v7, -v1, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v0, v5 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v1 ; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v5, vcc ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v8, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = udiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -4850,63 +4850,63 @@ ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s6 -; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX6-NEXT: s_and_b32 s7, s8, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_and_b32 s10, s8, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: s_bfe_u32 s5, s8, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 +; GFX6-NEXT: s_bfe_u32 s9, s8, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_alignbit_b32 v0, s7, v0, 30 +; GFX6-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_alignbit_b32 v2, s9, v2, 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s6, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 +; GFX6-NEXT: v_sub_i32_e32 v7, vcc, s6, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 ; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_f32_e32 v8, v4, v8 +; GFX6-NEXT: v_trunc_f32_e32 v8, v8 +; GFX6-NEXT: v_cvt_u32_f32_e32 v9, v8 +; GFX6-NEXT: v_mad_f32 v4, -v8, v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 +; GFX6-NEXT: s_lshr_b32 s5, s8, 15 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 -; GFX6-NEXT: s_lshr_b32 s5, s8, 15 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: s_mov_b32 s0, s4 ; GFX6-NEXT: s_lshr_b32 s4, s6, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v7 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: urem_v3i15: @@ -4915,60 +4915,60 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 -; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: s_and_b32 s8, s0, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 +; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: s_bfe_u32 s7, s6, 0xf000f +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 ; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 -; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v4 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_f32 v7, -v5, v6, v7 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v9, v8, v9 +; GFX9-NEXT: v_trunc_f32_e32 v9, v9 +; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 +; GFX9-NEXT: v_mad_f32 v8, -v9, v4, v8 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v8|, v4 ; GFX9-NEXT: s_lshr_b32 s1, s0, 15 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v10, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v4, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 -; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX9-NEXT: s_lshr_b32 s0, s6, 15 -; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: s_lshr_b32 s2, s6, 15 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v1 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = urem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5093,35 +5093,35 @@ ; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v6 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5 +; GFX6-NEXT: v_mad_f32 v4, -v1, v2, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v3i15: @@ -5160,36 +5160,36 @@ ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 ; GFX9-NEXT: s_or_b32 s2, s0, 1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 -; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 +; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v5 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| ; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: s_cselect_b32 s0, s2, 0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_add_u32_e32 v3, s0, v6 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = sdiv <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -5325,40 +5325,40 @@ ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s6, v4 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v2 ; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 -; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 ; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 +; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 +; GFX6-NEXT: v_mul_f32_e32 v2, v6, v8 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 +; GFX6-NEXT: v_mad_f32 v6, -v2, v5, v6 ; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| +; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v5 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v7 +; GFX6-NEXT: v_mul_lo_u32 v2, v2, s9 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v2 +; GFX6-NEXT: v_and_b32_e32 v1, 0x1fff, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v3i15: @@ -5369,74 +5369,74 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf0000 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s8, s6, 15 -; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v7, v7 +; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 +; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 ; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30 ; GFX9-NEXT: v_alignbit_b32 v1, s3, v1, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15 ; GFX9-NEXT: s_or_b32 s7, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cselect_b32 s0, s7, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 +; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 ; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 +; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s0 ; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 +; GFX9-NEXT: v_cvt_f32_i32_e32 v7, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v1 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v8, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v8, v8 +; GFX9-NEXT: v_mad_f32 v7, -v8, v6, v7 +; GFX9-NEXT: v_mul_lo_u32 v5, v5, s2 ; GFX9-NEXT: s_or_b32 s2, s0, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, |v6| ; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v0 +; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 +; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v8 +; GFX9-NEXT: v_mul_f32_e32 v1, v7, v9 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v1 +; GFX9-NEXT: v_mad_f32 v1, -v1, v6, v7 +; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v6| +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX9-NEXT: v_add_u32_e32 v0, v9, v0 ; GFX9-NEXT: s_cselect_b32 s0, s2, 0 -; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 -; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 -; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 -; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX9-NEXT: v_add_u32_e32 v1, s0, v8 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, v3, v0 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v3, s8, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1fff, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-NEXT: global_store_short v2, v1, s[4:5] offset:4 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9-NEXT: global_store_dword v2, v0, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 ; GFX9-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -37,44 +37,43 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc + ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF - ; GFX90A-NEXT: $agpr0 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr26 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr24, $sgpr33, $vgpr31, $agpr0, $vgpr26, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr20, $vgpr22 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58, $sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr23, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr4 = V_AND_B32_e32 1023, $vgpr31, implicit $exec + ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = COPY renamable $sgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr46, renamable $vcc = V_ADD_CO_U32_e64 $sgpr24, $vgpr0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr1, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, $vgpr4, implicit $exec + ; GFX90A-NEXT: renamable $vgpr47, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr4, killed $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 2, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr40, renamable $vcc = V_ADD_CO_U32_e64 $vgpr46, killed $vgpr0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr41, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr47, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -82,7 +81,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 @@ -95,9 +94,9 @@ ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -105,32 +104,32 @@ ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr21 = COPY renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = COPY renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = COPY $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr20 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr23 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr22 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr25 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $vgpr24 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = COPY $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26 = COPY $sgpr17, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec @@ -138,7 +137,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec @@ -147,15 +146,15 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr56_sgpr57, implicit-def $exec, implicit-def $scc, implicit $exec @@ -164,15 +163,15 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr54_sgpr55, implicit-def $exec, implicit-def $scc, implicit $exec @@ -181,10 +180,10 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) - ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) + ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: @@ -366,7 +365,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec :: (load (s8) from %ir.i21, addrspace 1) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec @@ -383,37 +382,37 @@ ; GFX90A-NEXT: renamable $vgpr43, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_LT_I16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr42_sgpr43 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec :: (load (s8) from %ir.i28, addrspace 1) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec @@ -421,29 +420,29 @@ ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -464,7 +463,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr44_sgpr45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = GLOBAL_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec :: (load (s8) from %ir.i35, addrspace 1) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, implicit-def $vcc, implicit $exec @@ -472,28 +471,28 @@ ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -513,7 +512,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr48_sgpr49, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -522,33 +521,33 @@ ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr20, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr18, implicit $exec ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc @@ -565,7 +564,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr33, 16, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc @@ -573,33 +572,33 @@ ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr48_sgpr49, implicit-def dead $scc - ; GFX90A-NEXT: $agpr0 = IMPLICIT_DEF - ; GFX90A-NEXT: $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF + ; GFX90A-NEXT: $vgpr12 = IMPLICIT_DEF ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr20, $vgpr61, $vgpr31, $vgpr63, $agpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr4, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr14 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr58, $vgpr57, $vgpr18, $vgpr30, $vgpr31, $vgpr61, $vgpr63, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $vgpr40, $vgpr62, $vgpr60, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr56, $vgpr47, $vgpr2, $vgpr3, $vgpr46, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr10, $vgpr12 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc @@ -615,7 +614,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr33, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr44_sgpr45, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc @@ -629,26 +628,26 @@ ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc @@ -666,133 +665,135 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55, $sgpr46_sgpr47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 3, $vgpr4_vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec + ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr48_sgpr49, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57, $sgpr54_sgpr55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr50_sgpr51 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr58_sgpr59:0x000000000000000F, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr5 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) - ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr6 = GLOBAL_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec :: (load (s8) from %ir.i74, addrspace 1) + ; GFX90A-NEXT: renamable $vgpr4 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr1, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr5, dead renamable $sgpr56_sgpr57 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $agpr1 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr17 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr5, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr58, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr5, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr30 = V_ALIGNBIT_B32_e64 $vgpr19, $vgpr18, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr58, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr59, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr31, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $exec:0x000000000000000F, $sgpr14, $sgpr15, $sgpr16, $sgpr17:0x0000000000000003, $sgpr23:0x0000000000000003, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr17, implicit $exec ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 @@ -803,9 +804,9 @@ ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF @@ -815,12 +816,12 @@ ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr30 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr54 = COPY renamable $vgpr19, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = COPY renamable $vgpr17, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr15, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} @@ -829,15 +830,15 @@ ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr58_sgpr59:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.434, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr17, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.435, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr23 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 @@ -845,38 +846,37 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr20, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr58_sgpr59:0x000000000000000F, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) + ; GFX90A-NEXT: renamable $vgpr8 = V_OR_B32_e32 1, $vgpr6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9 = COPY renamable $vgpr7, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = FLAT_LOAD_UBYTE renamable $vgpr8_vgpr9, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr17 = S_MOV_B32 0 ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr5, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr10, implicit $exec ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr30 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr54 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr52_sgpr53 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr52_sgpr53, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $agpr0_agpr1 = COPY killed renamable $vgpr12_vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr17, $vgpr19, $vgpr20, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr15, $vgpr17, $vgpr18, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc @@ -888,7 +888,7 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc @@ -896,122 +896,120 @@ ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr4, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.67, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr23, killed $vgpr25, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = COPY killed renamable $agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr3, killed $vgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr54, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr30, killed $vgpr15, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000F, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr28 = V_OR_B32_e32 1, $vgpr26, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_CNDMASK_B32_e64 0, $vgpr36, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr32, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = COPY renamable $agpr0_agpr1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, killed $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr34, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr48, $vgpr12, implicit $exec + ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = V_LSHLREV_B64_e64 3, killed $vgpr2_vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr5, killed $vgpr3, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr29 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr29, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr29, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr29, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr54, $agpr0_agpr1:0x000000000000000C, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x0000000000000003, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr5 = V_OR_B32_e32 $vgpr52, killed $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = V_OR_B32_e32 killed $vgpr5, killed $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, renamable $vgpr12_vgpr13, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -449,14 +449,14 @@ ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] +; GCN-O0-NEXT: s_mov_b32 s4, 2 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s4, v1 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 -; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 @@ -684,15 +684,14 @@ ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 ; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b32 s0, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr0 -; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: s_mov_b32 s1, s0 -; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s1 +; GCN-O0-NEXT: v_lshlrev_b32_e64 v3, s0, v1 +; GCN-O0-NEXT: s_mov_b32 s1, 0 +; GCN-O0-NEXT: ; implicit-def: $sgpr1 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 +; GCN-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v4, v2 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b32 s2, s4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -646,10 +646,8 @@ ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s6, 0x400 -; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 @@ -657,22 +655,19 @@ ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_and_b32 s4, 0xffff, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_add_u16_e64 v3, s7, 1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v3 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v3 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[4:5], 1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 +; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v6, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_u32 s8, s2, s0 -; GFX9-NEXT: v_mad_f32 v3, -v3, v0, v4 -; GFX9-NEXT: s_addc_u32 s9, s3, s1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, v0 -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: global_store_short v3, v4, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB4_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -683,30 +678,25 @@ ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s0, s4, 0xffff -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX10-NEXT: v_add_nc_u16 v3, s4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s4 -; GFX10-NEXT: v_readfirstlane_b32 s4, v3 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX10-NEXT: s_addc_u32 s7, s3, s5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo -; GFX10-NEXT: v_trunc_f32_e32 v3, v3 -; GFX10-NEXT: v_mad_f32 v4, -v3, v0, v4 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX10-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 +; GFX10-NEXT: global_store_short v3, v4, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB4_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -717,36 +707,31 @@ ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s0, s4, 0xffff -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB4_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_and_b32 s0, 0xffff, s4 -; GFX11-NEXT: v_add_nc_u16 v3, s4, 1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v4, s0 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_add_u32 s6, s2, s4 -; GFX11-NEXT: v_readfirstlane_b32 s4, v3 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 1, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v4, v1 -; GFX11-NEXT: s_addc_u32 s7, s3, s5 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 ; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v3, v3 -; GFX11-NEXT: v_fma_f32 v4, -v3, v0, v4 -; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v4|, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s0, 0, v3, s0 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v5, s0 +; GFX11-NEXT: global_store_b16 v3, v4, s[2:3] ; GFX11-NEXT: s_cbranch_vccz .LBB4_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -773,33 +758,31 @@ ; GFX9-LABEL: urem16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s7, 0x400 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_movk_i32 s5, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s6, s2, 0xffff -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX9-NEXT: s_and_b32 s4, s2, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 -; GFX9-NEXT: v_mov_b32_e32 v7, s5 -; GFX9-NEXT: v_mul_f32_e32 v9, v8, v3 -; GFX9-NEXT: v_trunc_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 -; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6 -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 -; GFX9-NEXT: global_store_short v[5:6], v0, off +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX9-NEXT: v_add_u16_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX9-NEXT: v_mul_f32_e32 v6, v4, v1 +; GFX9-NEXT: v_trunc_f32_e32 v6, v6 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX9-NEXT: v_mad_f32 v4, -v6, v0, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, v0 +; GFX9-NEXT: s_and_b64 vcc, exec, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_short v5, v3, s[2:3] ; GFX9-NEXT: s_cbranch_vccz .LBB5_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -809,30 +792,27 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s1, s4, 0xffff -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX10-NEXT: s_and_b32 s0, s4, 0xffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] -; GFX10-NEXT: v_mul_f32_e32 v8, v7, v3 -; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX10-NEXT: v_trunc_f32_e32 v8, v8 -; GFX10-NEXT: v_mad_f32 v7, -v8, v2, v7 -; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 -; GFX10-NEXT: global_store_short v[5:6], v0, off +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX10-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX10-NEXT: v_trunc_f32_e32 v5, v5 +; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 +; GFX10-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX10-NEXT: v_mul_lo_u32 v4, v4, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, v3, v4 +; GFX10-NEXT: global_store_short v5, v3, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -840,42 +820,38 @@ ; GFX11-LABEL: urem16_invariant_denom: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c -; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s1, s4, 0xffff +; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v3, v2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; GFX11-NEXT: v_add_nc_u16 v4, v4, 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX11-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v2 +; GFX11-NEXT: v_add_nc_u16 v2, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v8, v7, v3 -; GFX11-NEXT: v_add_co_u32 v5, s0, s2, v5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v8, v8 -; GFX11-NEXT: v_fma_f32 v7, -v8, v2, v7 -; GFX11-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v7, v7, s1 -; GFX11-NEXT: v_sub_nc_u32_e32 v0, v0, v7 -; GFX11-NEXT: global_store_b16 v[5:6], v0, off +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 +; GFX11-NEXT: v_trunc_f32_e32 v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v4|, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3 +; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4 +; GFX11-NEXT: global_store_b16 v5, v3, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB5_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 -; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -902,36 +878,33 @@ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s2, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GFX9-NEXT: s_xor_b32 s7, s2, s4 -; GFX9-NEXT: s_ashr_i32 s2, s7, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX9-NEXT: s_cselect_b32 s7, s2, 0 -; GFX9-NEXT: s_and_b32 s2, s6, 0xffff -; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 -; GFX9-NEXT: s_add_u32 s8, s0, s8 -; GFX9-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9-NEXT: v_add_u32_e32 v3, s7, v5 -; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_ashr_i32 s5, s6, 30 +; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 +; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s5, v4 +; GFX9-NEXT: s_lshl_b32 s5, s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -941,36 +914,33 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s0, s5 -; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX10-NEXT: s_xor_b32 s0, s0, s4 -; GFX10-NEXT: s_ashr_i32 s0, s0, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s0, 0 -; GFX10-NEXT: s_and_b32 s0, s5, 0xffff -; GFX10-NEXT: v_readfirstlane_b32 s5, v3 -; GFX10-NEXT: v_add_nc_u32_e32 v3, s6, v4 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s6 -; GFX10-NEXT: s_addc_u32 s7, s3, s7 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_ashr_i32 s4, s5, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: s_or_b32 s4, s4, 1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX10-NEXT: s_and_b32 s5, s5, exec_lo +; GFX10-NEXT: s_cselect_b32 s4, s4, 0 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: global_store_short v3, v2, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB6_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -980,43 +950,39 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_sext_i32_i16 s2, s5 -; GFX11-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, s2 -; GFX11-NEXT: s_xor_b32 s2, s2, s4 +; GFX11-NEXT: s_sext_i32_i16 s4, s3 +; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX11-NEXT: s_xor_b32 s5, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_ashr_i32 s2, s2, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: s_ashr_i32 s4, s5, 30 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: s_or_b32 s2, s2, 1 +; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_or_b32 s4, s4, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v5 -; GFX11-NEXT: s_and_b32 s6, s6, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s2, 0 -; GFX11-NEXT: s_and_b32 s2, s5, 0xffff -; GFX11-NEXT: v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: v_add_nc_u32_e32 v3, s6, v4 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], 1 +; GFX11-NEXT: v_trunc_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 +; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0| +; GFX11-NEXT: s_and_b32 s5, s5, exec_lo +; GFX11-NEXT: s_cselect_b32 s4, s4, 0 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s6, s0, s6 -; GFX11-NEXT: s_addc_u32 s7, s1, s7 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4 +; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB6_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_nop 0 @@ -1045,38 +1011,35 @@ ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_sext_i32_i16 s2, s2 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_sext_i32_i16 s7, s6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 -; GFX9-NEXT: s_xor_b32 s2, s7, s4 -; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: s_or_b32 s2, s2, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[8:9], |v4|, |v0| -; GFX9-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX9-NEXT: v_add_u16_e64 v3, s6, 1 -; GFX9-NEXT: s_cselect_b32 s8, s2, 0 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v3 -; GFX9-NEXT: s_and_b32 s2, s6, 0xffff -; GFX9-NEXT: v_readfirstlane_b32 s6, v3 -; GFX9-NEXT: v_add_u32_e32 v3, s8, v5 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s4 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], 1 -; GFX9-NEXT: s_add_u32 s8, s0, s8 -; GFX9-NEXT: s_addc_u32 s9, s1, s9 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 -; GFX9-NEXT: global_store_short v2, v3, s[8:9] +; GFX9-NEXT: s_sext_i32_i16 s5, s3 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX9-NEXT: s_xor_b32 s6, s5, s2 +; GFX9-NEXT: s_ashr_i32 s6, s6, 30 +; GFX9-NEXT: s_or_b32 s8, s6, 1 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: v_add_u16_e64 v2, s3, 1 +; GFX9-NEXT: s_cselect_b32 s6, s8, 0 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v2 +; GFX9-NEXT: s_and_b32 s7, 0xffff, s3 +; GFX9-NEXT: v_readfirstlane_b32 s3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, s6, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 +; GFX9-NEXT: s_lshl_b32 s6, s7, 1 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 +; GFX9-NEXT: global_store_short v3, v2, s[0:1] ; GFX9-NEXT: s_cbranch_vccz .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %bb2 ; GFX9-NEXT: s_endpgm @@ -1086,38 +1049,36 @@ ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_mov_b32 s1, 0 -; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s4, s4 -; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX10-NEXT: s_sext_i32_i16 s0, s4 +; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: s_sext_i32_i16 s8, s5 -; GFX10-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX10-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX10-NEXT: s_xor_b32 s0, s8, s4 -; GFX10-NEXT: s_ashr_i32 s0, s0, 30 -; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX10-NEXT: s_or_b32 s0, s0, 1 -; GFX10-NEXT: v_trunc_f32_e32 v5, v5 -; GFX10-NEXT: v_mad_f32 v4, -v5, v0, v4 -; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX10-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: v_add_nc_u16 v2, s1, 1 +; GFX10-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX10-NEXT: s_xor_b32 s5, s4, s0 +; GFX10-NEXT: s_ashr_i32 s5, s5, 30 +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 +; GFX10-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX10-NEXT: s_or_b32 s5, s5, 1 +; GFX10-NEXT: v_trunc_f32_e32 v4, v4 +; GFX10-NEXT: v_mad_f32 v3, -v4, v0, v3 +; GFX10-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| +; GFX10-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s0, 0 -; GFX10-NEXT: s_and_b32 s0, s5, 0xffff -; GFX10-NEXT: v_add_nc_u32_e32 v4, s6, v4 -; GFX10-NEXT: v_readfirstlane_b32 s5, v3 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 -; GFX10-NEXT: s_add_u32 s6, s2, s6 -; GFX10-NEXT: v_mul_lo_u32 v3, v4, s4 -; GFX10-NEXT: s_addc_u32 s7, s3, s7 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: global_store_short v2, v3, s[6:7] +; GFX10-NEXT: s_cselect_b32 s5, s5, 0 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX10-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mul_lo_u32 v3, v3, s0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX10-NEXT: global_store_short v2, v3, s[2:3] ; GFX10-NEXT: s_cbranch_vccz .LBB7_1 ; GFX10-NEXT: ; %bb.2: ; %bb2 ; GFX10-NEXT: s_endpgm @@ -1127,47 +1088,45 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sext_i32_i16 s4, s2 +; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: s_sext_i32_i16 s8, s5 -; GFX11-NEXT: v_add_nc_u16 v3, s5, 1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v4, s8 -; GFX11-NEXT: s_xor_b32 s2, s8, s4 +; GFX11-NEXT: s_sext_i32_i16 s4, s3 +; GFX11-NEXT: v_add_nc_u16 v2, s3, 1 +; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX11-NEXT: s_xor_b32 s5, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: s_ashr_i32 s2, s2, 30 -; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v3 +; GFX11-NEXT: s_ashr_i32 s5, s5, 30 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX11-NEXT: s_or_b32 s2, s2, 1 +; GFX11-NEXT: v_mul_f32_e32 v4, v3, v1 +; GFX11-NEXT: s_or_b32 s5, s5, 1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v5, v5 -; GFX11-NEXT: v_fma_f32 v4, -v5, v0, v4 +; GFX11-NEXT: v_trunc_f32_e32 v4, v4 +; GFX11-NEXT: v_fma_f32 v3, -v4, v0, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v4|, |v0| -; GFX11-NEXT: v_cvt_i32_f32_e32 v4, v5 +; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v3|, |v0| +; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s2, 0 -; GFX11-NEXT: s_and_b32 s2, s5, 0xffff -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add_nc_u32_e32 v4, s6, v4 -; GFX11-NEXT: v_readfirstlane_b32 s5, v3 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], 1 -; GFX11-NEXT: s_add_u32 s6, s0, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_lo_u32 v3, v4, s4 -; GFX11-NEXT: s_addc_u32 s7, s1, s7 -; GFX11-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX11-NEXT: global_store_b16 v2, v3, s[6:7] +; GFX11-NEXT: s_cselect_b32 s5, s5, 0 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v2 +; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s5 +; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3 +; GFX11-NEXT: global_store_b16 v2, v3, s[0:1] ; GFX11-NEXT: s_cbranch_vccz .LBB7_1 ; GFX11-NEXT: ; %bb.2: ; %bb2 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -28,138 +28,136 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] ; GFX8-NEXT: s_movk_i32 s0, 0x3000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x3800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x3800, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: clmem_read_simplified: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v18, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v18 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s1, 0x2000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 -; GFX900-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[6:7], off -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x3000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: clmem_read_simplified: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v18 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s1, 0x2000 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[6:7], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: clmem_read_simplified: ; GFX10: ; %bb.0: ; %entry @@ -181,15 +179,14 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v20 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x1000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v0, 0x2000 @@ -234,77 +231,6 @@ ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: clmem_read_simplified: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v18, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v18 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s1, 0x2000 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2048 -; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, s1, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[10:11], off offset:2048 -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x3000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v12, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v13, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v18, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: clmem_read_simplified: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -316,17 +242,17 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v16 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off ; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2048 @@ -436,98 +362,97 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX8-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v2, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xfe000000, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x5000 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_movk_i32 s0, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX8-NEXT: v_mov_b32_e32 v7, v3 ; GFX8-NEXT: v_mov_b32_e32 v6, v2 -; GFX8-NEXT: v_mov_b32_e32 v5, v1 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0xffffb000, v5 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffb800, v5 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0xffffc000, v5 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] -; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xffffc800, v5 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xffffd000, v5 -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[11:12] -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffd800, v5 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffe000, v5 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe800, v5 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[19:20] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[21:22], v[21:22] -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 0xfffff000, v5 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[23:24], v[23:24] -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 0xfffff800, v5 -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[25:26], v[25:26] -; GFX8-NEXT: flat_load_dwordx2 v[27:28], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x10000, v5 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffb000, v6 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffb800, v6 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffc000, v6 +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xffffc800, v6 +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xffffd000, v6 +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xffffd800, v6 +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffe000, v6 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe800, v6 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[22:23], v[22:23] +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xfffff000, v6 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[24:25], v[24:25] +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xfffff800, v6 +; GFX8-NEXT: v_addc_u32_e32 v27, vcc, -1, v7, vcc +; GFX8-NEXT: flat_load_dwordx2 v[26:27], v[26:27] +; GFX8-NEXT: flat_load_dwordx2 v[28:29], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x10000, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GFX8-NEXT: s_addk_i32 s1, 0x2000 ; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX8-NEXT: s_waitcnt vmcnt(10) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v4 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(9) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v9, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v11, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v13, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v13, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v14, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v16, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v16, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v17, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v18, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v18, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v19, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v19, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v20, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v20, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v21, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v21, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v22, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v22, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v23, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v23, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v24, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v24, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v25, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v25, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v26, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v26, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v27, v4, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v27, v3 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v28, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v28, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v29, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -538,10 +463,10 @@ ; GFX8-NEXT: s_mov_b32 s0, s1 ; GFX8-NEXT: s_branch .LBB1_1 ; GFX8-NEXT: .LBB1_5: ; %while.end -; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -565,14 +490,12 @@ ; GFX900-NEXT: s_waitcnt lgkmcnt(0) ; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 17, v0 -; GFX900-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] ; GFX900-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX900-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v3, s35 +; GFX900-NEXT: v_lshl_or_b32 v1, v1, 3, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, s35 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 -; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX900-NEXT: s_movk_i32 s0, 0x5000 ; GFX900-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 @@ -682,17 +605,15 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 17, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_movk_i32 s1, 0x7f -; GFX10-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX10-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, s34 -; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v0, 0xfe000000, v1 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 3, v0 +; GFX10-NEXT: v_add_co_u32 v1, s0, v1, s34 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, 0, s35, s0 ; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, 0x5000, v1 ; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader @@ -796,15 +717,13 @@ ; GFX90A-NEXT: s_mov_b32 s32, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_and_b32_e32 v1, 0xff, v0 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 17, v0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 -; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] -; GFX90A-NEXT: v_or_b32_e32 v1, v2, v0 +; GFX90A-NEXT: v_lshl_or_b32 v1, v1, 3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v2, vcc +; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc ; GFX90A-NEXT: s_movk_i32 s0, 0x5000 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc @@ -903,20 +822,18 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0xff, v0 -; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 17, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 17, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v2, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_movk_i32 s1, 0x7f -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b64 v[1:2], 3, v[1:2] -; GFX11-NEXT: v_and_b32_e32 v0, 0xfe000000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xfe000000, v1 +; GFX11-NEXT: v_lshl_or_b32 v1, v2, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v1, v0 -; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, s34 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s35, v2, vcc_lo +; GFX11-NEXT: v_add_co_u32 v1, s0, v1, s34 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, 0, s35, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, 0x5000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX11-NEXT: ; =>This Loop Header: Depth=1 @@ -1135,39 +1052,38 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 2 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x400 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xc00 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1400 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1c00 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: flat_load_dword v19, v[5:6] ; GFX8-NEXT: flat_load_dword v7, v[7:8] ; GFX8-NEXT: flat_load_dword v8, v[9:10] @@ -1175,90 +1091,89 @@ ; GFX8-NEXT: flat_load_dword v10, v[13:14] ; GFX8-NEXT: flat_load_dword v11, v[15:16] ; GFX8-NEXT: flat_load_dword v12, v[17:18] -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x2400, v0 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x2400, v3 ; GFX8-NEXT: flat_load_dword v5, v[5:6] -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(8) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v19, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v19, v0 ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0 ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v9, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v10, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v10, v0 ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v11, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v11, v0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v12, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v12, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: Address32: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v4, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v5, v[0:1], off -; GFX900-NEXT: global_load_dword v6, v[0:1], off offset:1024 -; GFX900-NEXT: global_load_dword v7, v[0:1], off offset:2048 -; GFX900-NEXT: global_load_dword v8, v[0:1], off offset:3072 -; GFX900-NEXT: global_load_dword v9, v[2:3], off -; GFX900-NEXT: global_load_dword v10, v[2:3], off offset:1024 -; GFX900-NEXT: global_load_dword v11, v[2:3], off offset:2048 -; GFX900-NEXT: global_load_dword v12, v[2:3], off offset:3072 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v2, v[0:1], off -; GFX900-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX900-NEXT: s_waitcnt vmcnt(8) -; GFX900-NEXT: v_add_u32_e32 v0, v6, v5 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add3_u32 v0, v7, v0, v8 -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add3_u32 v0, v11, v0, v12 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add3_u32 v0, v2, v0, v3 -; GFX900-NEXT: global_store_dword v4, v0, s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: Address32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v4 +; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:1024 +; GFX9-NEXT: global_load_dword v7, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_dword v8, v[0:1], off offset:3072 +; GFX9-NEXT: global_load_dword v9, v[2:3], off +; GFX9-NEXT: global_load_dword v10, v[2:3], off offset:1024 +; GFX9-NEXT: global_load_dword v11, v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dword v12, v[2:3], off offset:3072 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:1024 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_u32_e32 v0, v6, v5 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add3_u32 v0, v7, v0, v8 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add3_u32 v0, v11, v0, v12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v2, v0, v3 +; GFX9-NEXT: global_store_dword v4, v0, s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Address32: ; GFX10: ; %bb.0: ; %entry @@ -1280,15 +1195,14 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0x1000 @@ -1327,64 +1241,6 @@ ; GFX10-NEXT: global_store_dword v8, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: Address32: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v4, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, s34, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: global_load_dword v6, v[0:1], off offset:1024 -; GFX90A-NEXT: global_load_dword v7, v[0:1], off offset:2048 -; GFX90A-NEXT: global_load_dword v8, v[0:1], off offset:3072 -; GFX90A-NEXT: global_load_dword v9, v[2:3], off -; GFX90A-NEXT: global_load_dword v10, v[2:3], off offset:1024 -; GFX90A-NEXT: global_load_dword v11, v[2:3], off offset:2048 -; GFX90A-NEXT: global_load_dword v12, v[2:3], off offset:3072 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v2, v[0:1], off -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_u32_e32 v0, v6, v5 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add3_u32 v0, v7, v0, v8 -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add3_u32 v0, v11, v0, v12 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add3_u32 v0, v2, v0, v3 -; GFX90A-NEXT: global_store_dword v4, v0, s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: Address32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1396,17 +1252,17 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v6 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v7, v[0:1], off ; GFX11-NEXT: global_load_b32 v8, v[0:1], off offset:1024 @@ -1513,89 +1369,87 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf000 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0xf800 -; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 1, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 1, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v7 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v8, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: Offset64: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v12 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0xf000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: Offset64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v12 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 +; GFX9-NEXT: s_movk_i32 s0, 0xf000 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: Offset64: ; GFX10: ; %bb.0: ; %entry @@ -1617,15 +1471,14 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v12 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v12 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0xfffff800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: s_clause 0x1 @@ -1648,56 +1501,6 @@ ; GFX10-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: Offset64: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v12, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v12 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 1, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:-4096 -; GFX90A-NEXT: s_movk_i32 s0, 0xf000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: Offset64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1709,17 +1512,17 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, 0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 1, v1, vcc_lo @@ -1794,81 +1597,80 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 2 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_mov_b32 s0, 0x7ffff800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_mov_b32 s0, 0x7ffffc00 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v0, v[3:4] ; GFX8-NEXT: flat_load_dword v5, v[5:6] ; GFX8-NEXT: flat_load_dword v6, v[7:8] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80000000, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v6, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: p32Offset64: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v6, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v6 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 2, v[1:2] -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dword v7, v[0:1], off -; GFX900-NEXT: global_load_dword v8, v[2:3], off offset:2048 -; GFX900-NEXT: global_load_dword v9, v[2:3], off offset:3072 -; GFX900-NEXT: global_load_dword v10, v[4:5], off -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_u32_e32 v0, v8, v7 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX900-NEXT: global_store_dword v6, v0, s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: p32Offset64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v6 +; GFX9-NEXT: v_mov_b32_e32 v3, 2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_mov_b32 s0, 0x7ffff000 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v8, v[2:3], off offset:2048 +; GFX9-NEXT: global_load_dword v9, v[2:3], off offset:3072 +; GFX9-NEXT: global_load_dword v10, v[4:5], off +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, v8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add3_u32 v0, v9, v0, v10 +; GFX9-NEXT: global_store_dword v6, v0, s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: p32Offset64: ; GFX10: ; %bb.0: ; %entry @@ -1890,15 +1692,14 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v4 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 2 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 0x80000000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: global_load_dword v5, v[0:1], off @@ -1915,51 +1716,6 @@ ; GFX10-NEXT: global_store_dword v4, v0, s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: p32Offset64: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x7ffff000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x80000000, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dword v7, v[0:1], off -; GFX90A-NEXT: global_load_dword v8, v[2:3], off offset:2048 -; GFX90A-NEXT: global_load_dword v9, v[2:3], off offset:3072 -; GFX90A-NEXT: global_load_dword v10, v[4:5], off -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_u32_e32 v0, v8, v7 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add3_u32 v0, v9, v0, v10 -; GFX90A-NEXT: global_store_dword v6, v0, s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: p32Offset64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -1971,17 +1727,17 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v6 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v6 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x7ffff000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo @@ -2322,137 +2078,135 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x3800 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x3000 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x2800 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[11:12], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: flat_load_dwordx2 v[7:8], v[7:8] ; GFX8-NEXT: flat_load_dwordx2 v[9:10], v[9:10] ; GFX8-NEXT: s_movk_i32 s0, 0x2000 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v4, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x1800 -; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] ; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] ; GFX8-NEXT: s_movk_i32 s0, 0x1000 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s0, v3 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v4, vcc ; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[17:18] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x800, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x800, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v11 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v6, v12, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v8, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v9, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v10, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v13, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v13, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v15, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v15, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v17, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v17, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v18, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: ReverseOrder: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v22, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v22 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x3000 -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX900-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX900-NEXT: s_movk_i32 s0, 0x2000 -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[4:5], off -; GFX900-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 -; GFX900-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(6) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc -; GFX900-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: ReverseOrder: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v22 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[4:5], off +; GFX9-NEXT: s_movk_i32 s0, 0x2000 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[14:15], v[12:13], off +; GFX9-NEXT: global_load_dwordx2 v[16:17], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 +; GFX9-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc +; GFX9-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ReverseOrder: ; GFX10: ; %bb.0: ; %entry @@ -2474,15 +2228,14 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v20 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v20 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0x3800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x3000, v0 @@ -2531,76 +2284,6 @@ ; GFX10-NEXT: global_store_dwordx2 v20, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: ReverseOrder: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v22, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v22 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x3000 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[4:5], off offset:2048 -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[4:5], off -; GFX90A-NEXT: s_movk_i32 s0, 0x2000 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[4:5], off offset:2048 -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, s0, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[12:13], off -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[4:5], off -; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[12:13], off offset:2048 -; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[0:1], off offset:2048 -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v3, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v8, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v9, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v11, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v16, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v17, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v18, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v19, v1, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v14, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v15, v1, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v20, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v21, v1, vcc -; GFX90A-NEXT: global_store_dwordx2 v22, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: ReverseOrder: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -2612,17 +2295,17 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v16 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v16 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0x3000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo @@ -2733,71 +2416,69 @@ ; GFX8-NEXT: s_mov_b32 s32, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: v_mov_b32_e32 v4, s35 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, s34, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s35 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s34, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, 3 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc ; GFX8-NEXT: s_movk_i32 s0, 0x800 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v6, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v6, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0, v0 ; GFX8-NEXT: v_addc_u32_e32 v6, vcc, -1, v6, vcc -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: flat_load_dwordx2 v[3:4], v[3:4] ; GFX8-NEXT: flat_load_dwordx2 v[5:6], v[5:6] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc +; GFX8-NEXT: flat_store_dwordx2 v[1:2], v[3:4] ; GFX8-NEXT: s_endpgm ; -; GFX900-LABEL: negativeoffset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX900-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX900-NEXT: s_mov_b32 s38, -1 -; GFX900-NEXT: s_mov_b32 s39, 0xe00000 -; GFX900-NEXT: s_add_u32 s36, s36, s3 -; GFX900-NEXT: s_addc_u32 s37, s37, 0 -; GFX900-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX900-NEXT: s_getpc_b64 s[0:1] -; GFX900-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX900-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX900-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX900-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX900-NEXT: v_mov_b32_e32 v31, v0 -; GFX900-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX900-NEXT: v_and_b32_e32 v1, 0xff, v0 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX900-NEXT: v_and_b32_e32 v8, 0xffff8000, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_mov_b32_e32 v0, s35 -; GFX900-NEXT: v_add_co_u32_e32 v3, vcc, s34, v8 -; GFX900-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc -; GFX900-NEXT: v_lshlrev_b64 v[0:1], 3, v[1:2] -; GFX900-NEXT: s_movk_i32 s0, 0x1000 -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, v3, v0 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc -; GFX900-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] -; GFX900-NEXT: s_endpgm +; GFX9-LABEL: negativeoffset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s3 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s35 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s34, v8 +; GFX9-NEXT: v_mov_b32_e32 v3, 3 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX9-NEXT: s_movk_i32 s0, 0x1000 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] +; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: negativeoffset: ; GFX10: ; %bb.0: ; %entry @@ -2819,19 +2500,18 @@ ; GFX10-NEXT: s_mov_b32 s32, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 7, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] -; GFX10-NEXT: v_add_co_u32 v2, s0, s34, v8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 7, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff8000, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_add_co_u32 v1, s0, s34, v8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s35, 0, s0 +; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v4, vcc_lo ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx2 v[6:7], v[2:3], off @@ -2841,49 +2521,6 @@ ; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] ; GFX10-NEXT: s_endpgm ; -; GFX90A-LABEL: negativeoffset: -; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GFX90A-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GFX90A-NEXT: s_mov_b32 s38, -1 -; GFX90A-NEXT: s_mov_b32 s39, 0xe00000 -; GFX90A-NEXT: s_add_u32 s36, s36, s3 -; GFX90A-NEXT: s_addc_u32 s37, s37, 0 -; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x24 -; GFX90A-NEXT: s_getpc_b64 s[0:1] -; GFX90A-NEXT: s_add_u32 s0, s0, _Z13get_global_idj@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s1, s1, _Z13get_global_idj@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX90A-NEXT: v_mov_b32_e32 v31, v0 -; GFX90A-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: s_mov_b32 s32, 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX90A-NEXT: v_and_b32_e32 v2, 0xff, v0 -; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0 -; GFX90A-NEXT: v_and_b32_e32 v8, 0xffff8000, v0 -; GFX90A-NEXT: v_mov_b32_e32 v3, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, s35 -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s34, v8 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v0, vcc -; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 3, v[2:3] -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v4, v0 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v1, vcc -; GFX90A-NEXT: s_movk_i32 s0, 0x1000 -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0, v2 -; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc -; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[6:7], v[2:3], off -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v6, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v5, vcc -; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[34:35] -; GFX90A-NEXT: s_endpgm -; ; GFX11-LABEL: negativeoffset: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_getpc_b64 s[2:3] @@ -2895,22 +2532,22 @@ ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v2, 7, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff8000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v2, s0, s34, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s35, 0, s0 +; GFX11-NEXT: v_add_co_u32 v1, s0, s34, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v2, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v1, vcc_lo +; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, v1, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v2, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0, v3 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v5, vcc_lo ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:-2048 ; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -10083,27 +10083,17 @@ ; GFX6-NEXT: s_add_u32 s40, s40, s3 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 -; GFX6-NEXT: v_mov_b32_e32 v6, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_mov_b32_e32 v6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] -; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 -; GFX6-NEXT: v_mov_b32_e32 v8, v6 -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 +; GFX6-NEXT: v_lshlrev_b32_e32 v5, 8, v0 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:240 ; GFX6-NEXT: s_addc_u32 s41, s41, 0 -; GFX6-NEXT: s_mov_b32 s2, 0x83800 -; GFX6-NEXT: s_mov_b64 s[8:9], exec -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83400 +; GFX6-NEXT: s_mov_b64 s[8:9], exec ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -10111,7 +10101,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:224 ; GFX6-NEXT: s_mov_b32 s2, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10120,7 +10110,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:208 ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10129,7 +10119,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:192 ; GFX6-NEXT: s_mov_b32 s2, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10138,7 +10128,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:176 ; GFX6-NEXT: s_mov_b32 s2, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10147,7 +10137,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:160 ; GFX6-NEXT: s_mov_b32 s2, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10156,7 +10146,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:144 ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10165,7 +10155,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:128 ; GFX6-NEXT: s_mov_b32 s2, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10174,7 +10164,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:112 ; GFX6-NEXT: s_mov_b32 s2, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10183,7 +10173,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:96 ; GFX6-NEXT: s_mov_b32 s2, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10192,7 +10182,7 @@ ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 offset:80 ; GFX6-NEXT: s_mov_b32 s2, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill @@ -10200,17 +10190,18 @@ ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[20:23], v[5:6], s[4:7], 0 addr64 offset:64 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[5:6], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_load_dwordx4 v[7:10], v[5:6], s[4:7], 0 addr64 offset:16 ; GFX6-NEXT: s_mov_b32 s2, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_load_dwordx4 v[12:15], v[5:6], s[4:7], 0 addr64 offset:32 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b64 exec, 15 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 @@ -10225,7 +10216,7 @@ ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[8:9] -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 +; GFX6-NEXT: buffer_load_dwordx4 v[16:19], v[5:6], s[4:7], 0 addr64 offset:48 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 ; GFX6-NEXT: v_mov_b32_e32 v7, 1 @@ -10245,7 +10236,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 -; GFX6-NEXT: s_mov_b32 s12, 0x83c00 +; GFX6-NEXT: s_mov_b32 s12, 0x83800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10285,7 +10276,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s13, 5 ; GFX6-NEXT: v_writelane_b32 v4, s14, 6 ; GFX6-NEXT: v_writelane_b32 v4, s15, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: s_mov_b32 s38, 0x84000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10293,7 +10284,7 @@ ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x83c00 +; GFX6-NEXT: s_mov_b32 s38, 0x83800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10321,7 +10312,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s21, 5 ; GFX6-NEXT: v_writelane_b32 v4, s22, 6 ; GFX6-NEXT: v_writelane_b32 v4, s23, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: s_mov_b32 s38, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10329,7 +10320,7 @@ ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84400 +; GFX6-NEXT: s_mov_b32 s38, 0x84000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10357,7 +10348,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s29, 5 ; GFX6-NEXT: v_writelane_b32 v4, s30, 6 ; GFX6-NEXT: v_writelane_b32 v4, s31, 7 -; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: s_mov_b32 s38, 0x85000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10365,7 +10356,7 @@ ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x84c00 +; GFX6-NEXT: s_mov_b32 s38, 0x84800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10389,7 +10380,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s1, 1 ; GFX6-NEXT: v_writelane_b32 v4, s2, 2 ; GFX6-NEXT: v_writelane_b32 v4, s3, 3 -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 +; GFX6-NEXT: s_mov_b32 s38, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10403,7 +10394,7 @@ ; GFX6-NEXT: v_writelane_b32 v4, s5, 1 ; GFX6-NEXT: v_writelane_b32 v4, s6, 2 ; GFX6-NEXT: v_writelane_b32 v4, s7, 3 -; GFX6-NEXT: s_mov_b32 s36, 0x86000 +; GFX6-NEXT: s_mov_b32 s36, 0x85c00 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10415,7 +10406,7 @@ ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_writelane_b32 v4, s2, 0 ; GFX6-NEXT: v_writelane_b32 v4, s3, 1 -; GFX6-NEXT: s_mov_b32 s4, 0x86400 +; GFX6-NEXT: s_mov_b32 s4, 0x86000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 @@ -10423,7 +10414,7 @@ ; GFX6-NEXT: s_mov_b64 exec, s[0:1] ; GFX6-NEXT: s_mov_b64 s[36:37], exec ; GFX6-NEXT: s_mov_b64 exec, 0xff -; GFX6-NEXT: s_mov_b32 s38, 0x85400 +; GFX6-NEXT: s_mov_b32 s38, 0x85000 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload @@ -10441,7 +10432,7 @@ ; GFX6-NEXT: s_mov_b64 exec, s[36:37] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2170 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload @@ -10456,7 +10447,7 @@ ; GFX6-NEXT: s_mov_b64 vcc, s[34:35] ; GFX6-NEXT: s_mov_b64 s[44:45], exec ; GFX6-NEXT: s_mov_b64 exec, 3 -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x2180 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload @@ -10472,7 +10463,7 @@ ; GFX6-NEXT: s_mov_b64 s[34:35], vcc ; GFX6-NEXT: s_mov_b64 s[4:5], exec ; GFX6-NEXT: s_mov_b64 exec, 15 -; GFX6-NEXT: s_mov_b32 s6, 0x85c00 +; GFX6-NEXT: s_mov_b32 s6, 0x85800 ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload @@ -10484,37 +10475,49 @@ ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[4:5] -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84400 -; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 -; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill -; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill -; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_mov_b32 s2, 0x85000 +; GFX6-NEXT: buffer_store_dword v20, off, s[40:43], s2 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v21, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v22, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill +; GFX6-NEXT: buffer_store_dword v23, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill +; GFX6-NEXT: s_waitcnt expcnt(3) ; GFX6-NEXT: ;;#ASMSTART ; GFX6-NEXT: ;;#ASMEND -; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x84400 -; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v21, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v22, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v23, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x84800 +; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v17, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x84000 +; GFX6-NEXT: buffer_load_dword v12, off, s[40:43], s2 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b32 s2, 0x83800 ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload @@ -10547,184 +10550,175 @@ ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: s_mov_b64 exec, s[2:3] -; GFX6-NEXT: s_mov_b32 s4, 0x83800 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s4, 0x83400 -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_mov_b32 s4, 0x83000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x82000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81400 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x81000 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:112 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x80800 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload ; GFX6-NEXT: s_mov_b32 s4, 0x80c00 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 -; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 -; GFX6-NEXT: buffer_store_dwordx4 v[13:16], v[4:5], s[0:3], 0 addr64 offset:32 -; GFX6-NEXT: s_waitcnt expcnt(2) -; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload -; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:16 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:80 +; GFX6-NEXT: buffer_store_dwordx4 v[20:23], v[5:6], s[0:3], 0 addr64 offset:64 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], v[5:6], s[0:3], 0 addr64 offset:48 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], v[5:6], s[0:3], 0 addr64 offset:32 +; GFX6-NEXT: s_waitcnt expcnt(3) +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[7:10], v[5:6], s[0:3], 0 addr64 offset:16 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: test_limited_sgpr: ; GFX9-FLATSCR: ; %bb.0: ; %entry ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v0 -; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:240 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:240 ; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:224 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:208 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:192 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:176 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:160 -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:144 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:128 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:112 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:96 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:80 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:224 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:64 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:208 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:48 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:192 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:176 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:160 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:144 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v5, s[38:39] offset:128 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:112 ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:96 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) +; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v7, 13, v4 +; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v6, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:80 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:32 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:64 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[1:4], v0, s[38:39] offset:16 -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:48 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, 16 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:32 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, v4 -; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-FLATSCR-NEXT: scratch_store_dword v4, v7, off +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] offset:16 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ; def s[0:7] ; GFX9-FLATSCR-NEXT: ;;#ASMEND @@ -10753,23 +10747,27 @@ ; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[15:18], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v11 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[19:22], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2100 -; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, v12 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s0 ; 16-byte Folded Spill ; GFX9-FLATSCR-NEXT: s_nop 0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20f0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20e0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[19:22], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[15:18], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v11, v0 ; GFX9-FLATSCR-NEXT: ;;#ASMSTART ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: ;;#ASMSTART @@ -10784,62 +10782,58 @@ ; GFX9-FLATSCR-NEXT: ;;#ASMEND ; GFX9-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20b0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s37 -; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s36, v4 -; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:112 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20a0 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:240 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:224 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:96 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2090 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:208 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:192 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:80 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2080 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:176 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:160 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20c0 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:64 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:48 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2060 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:32 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2050 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:144 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(2) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:128 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(3) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:112 -; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:96 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:80 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:16 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37] ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2040 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:64 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:240 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload -; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2070 +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2030 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:48 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:224 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 -; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[6:9], s[36:37] offset:208 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:192 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:176 +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:160 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload +; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2020 +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:144 +; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 -; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] offset:128 ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX10-FLATSCR-LABEL: test_limited_sgpr: @@ -10850,32 +10844,31 @@ ; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 ; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 0 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, 1 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v6, 1 ; GFX10-FLATSCR-NEXT: s_mov_b32 s33, exec_lo -; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v5, -1, v0 -; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0 +; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 8, v0 ; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-FLATSCR-NEXT: s_clause 0xf -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[64:67], v0, s[38:39] offset:240 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[60:63], v0, s[38:39] offset:224 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[56:59], v0, s[38:39] offset:208 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[52:55], v0, s[38:39] offset:192 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[48:51], v0, s[38:39] offset:176 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[44:47], v0, s[38:39] offset:160 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[40:43], v0, s[38:39] offset:144 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[36:39], v0, s[38:39] offset:128 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[32:35], v0, s[38:39] offset:112 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[28:31], v0, s[38:39] offset:96 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[24:27], v0, s[38:39] offset:80 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v0, s[38:39] offset:64 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[16:19], v0, s[38:39] offset:48 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[12:15], v0, s[38:39] offset:32 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[8:11], v0, s[38:39] offset:16 -; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v0, s[38:39] +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[35:38], v5, s[38:39] offset:240 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[31:34], v5, s[38:39] offset:224 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[27:30], v5, s[38:39] offset:208 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[23:26], v5, s[38:39] offset:192 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[19:22], v5, s[38:39] offset:176 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[15:18], v5, s[38:39] offset:160 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[11:14], v5, s[38:39] offset:144 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[7:10], v5, s[38:39] offset:128 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[63:66], v5, s[38:39] offset:112 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[59:62], v5, s[38:39] offset:96 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[55:58], v5, s[38:39] offset:80 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[51:54], v5, s[38:39] offset:64 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[47:50], v5, s[38:39] offset:48 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[43:46], v5, s[38:39] offset:32 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[39:42], v5, s[38:39] offset:16 +; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[38:39] ; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLATSCR-NEXT: v_lshl_add_u32 v4, v0, 13, 16 -; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v7, off +; GFX10-FLATSCR-NEXT: scratch_store_dword v4, v6, off ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ; def s[0:7] ; GFX10-FLATSCR-NEXT: ;;#ASMEND @@ -10904,124 +10897,124 @@ ; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v63 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v57 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v62 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v61 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v60 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v35 -; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v32 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v37 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v11 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v8 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v15 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v19 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v23 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v27 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v31 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v10 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v9 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v12 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v16 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v20 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v24 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v28 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v14 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v13 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v18 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v17 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v22 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v21 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v26 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v25 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v30 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v92, v62 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v87, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v86, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v85, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v91, v61 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v90, v60 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v89, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v34 +; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[63:66], s0 ; 16-byte Folded Spill +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v68, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v32 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v31 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v67, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v66, v36 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v65, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v10 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v72, v42 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v76, v46 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v80, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v84, v54 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v7 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v71, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v70, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v69, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v14 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v75, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v74, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v73, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v18 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v79, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v78, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v77, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v22 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v83, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v82, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v81, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v26 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v30 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v9 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v8 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v11 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v19 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v23 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v27 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v13 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v12 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v17 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v16 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v21 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v20 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v25 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v24 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v29 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v28 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v33 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v53 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v49 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v45 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v41 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v37 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v34 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v35 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v36 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v57 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v54 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v55 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v56 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v50 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v51 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v52 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v46 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v47 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v48 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v42 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v43 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v44 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v38 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v39 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v40 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v58 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v59 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v60 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v7, v33 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, v53 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, v49 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, v45 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v15, v41 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, v37 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v8, v34 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v9, v35 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v36 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v31, v57 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v54 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v55 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v30, v56 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, v50 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v51 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v52 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, v46 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, v47 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v48 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, v42 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v17, v43 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, v44 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, v38 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v13, v39 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, v40 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v32, v58 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v33, v59 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v34, v60 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v65 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v66 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v67 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v68 -; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[64:67], off, s0 ; 16-byte Folded Reload -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v89 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v85 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v81 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v77 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v73 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v69 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v90 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v91 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v63, v92 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v86 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v87 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v88 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v82 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v83 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v84 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v78 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v79 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v80 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v74 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v75 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v76 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v70 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v71 -; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v72 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v65 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v36, v66 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v37, v67 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v38, v68 +; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[63:66], off, s0 ; 16-byte Folded Reload +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v59, v89 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v55, v85 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v51, v81 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v47, v77 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v43, v73 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v39, v69 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v60, v90 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v61, v91 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v62, v92 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v56, v86 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v57, v87 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v58, v88 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v52, v82 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v53, v83 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v54, v84 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v48, v78 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v49, v79 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v50, v80 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v44, v74 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v45, v75 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v46, v76 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v40, v70 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v41, v71 +; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v42, v72 ; GFX10-FLATSCR-NEXT: ;;#ASMSTART ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: ;;#ASMSTART @@ -11034,26 +11027,23 @@ ; GFX10-FLATSCR-NEXT: ;;#ASMEND ; GFX10-FLATSCR-NEXT: .LBB1_2: ; %ret ; GFX10-FLATSCR-NEXT: s_or_b32 exec_lo, exec_lo, s33 -; GFX10-FLATSCR-NEXT: v_lshlrev_b64 v[4:5], 8, v[5:6] -; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, vcc_lo, s36, v4 -; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s37, v5, vcc_lo -; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[64:67], off offset:240 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[60:63], off offset:224 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[56:59], off offset:208 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[52:55], off offset:192 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[48:51], off offset:176 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[44:47], off offset:160 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[40:43], off offset:144 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[36:39], off offset:128 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[32:35], off offset:112 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[28:31], off offset:96 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[24:27], off offset:80 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[20:23], off offset:64 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[16:19], off offset:48 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[12:15], off offset:32 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[8:11], off offset:16 -; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[63:66], s[36:37] offset:112 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[59:62], s[36:37] offset:96 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[55:58], s[36:37] offset:80 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[51:54], s[36:37] offset:64 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[47:50], s[36:37] offset:48 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[43:46], s[36:37] offset:32 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[39:42], s[36:37] offset:16 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[36:37] +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[35:38], s[36:37] offset:240 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[31:34], s[36:37] offset:224 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[27:30], s[36:37] offset:208 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[23:26], s[36:37] offset:192 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[19:22], s[36:37] offset:176 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[15:18], s[36:37] offset:160 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[11:14], s[36:37] offset:144 +; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[7:10], s[36:37] offset:128 ; GFX10-FLATSCR-NEXT: s_endpgm entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -153,12 +153,13 @@ } ; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup: -; GCN: flat_load_{{dword|b32}} -; GFX8_9: s_waitcnt lgkmcnt(0){{$}} -; GFX8_9: s_waitcnt vmcnt(0){{$}} -; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10PLUS: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: s_barrier +; GCN: flat_load_{{dword|b32}} +; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX8_9-NEXT: s_barrier +; GFX8_9-NEXT: s_waitcnt lgkmcnt(0){{$}} +; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} +; GFX10PLUS-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10PLUS-NEXT: s_barrier define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -8,10 +8,10 @@ ; CHECK-NEXT: movl %edx, %edx ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %r10d -; CHECK-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $4, %r10d -; CHECK-NEXT: movl (%rdi,%rdx,4), %edx +; CHECK-NEXT: shrl $6, %edx +; CHECK-NEXT: andl $-4, %edx +; CHECK-NEXT: movl (%rdi,%rdx), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $5, %esi diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll --- a/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll +++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test-64.ll @@ -1223,6 +1223,7 @@ ; CHECK-NEXT: lock btrq %rsi, (%rdi) ; CHECK-NEXT: jae .LBB43_1 ; CHECK-NEXT: # %bb.2: # %if.then +; CHECK-NEXT: movl %esi, %esi ; CHECK-NEXT: movq (%rdi,%rsi,8), %rax ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB43_1: @@ -1392,11 +1393,8 @@ define i64 @atomic_shl1_xor_64_const_br(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_br: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, (%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: je .LBB48_1 +; CHECK-NEXT: jae .LBB48_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: retq @@ -1458,12 +1456,9 @@ define i64 @atomic_shl1_xor_64_const_brz(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_brz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, (%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax ; CHECK-NEXT: movl $123, %eax -; CHECK-NEXT: je .LBB50_1 +; CHECK-NEXT: jae .LBB50_1 ; CHECK-NEXT: # %bb.2: # %return ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB50_1: # %if.then @@ -1524,11 +1519,8 @@ define i64 @atomic_shl1_xor_64_const_brnz(ptr %v) nounwind { ; CHECK-LABEL: atomic_shl1_xor_64_const_brnz: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: lock btcq $4, (%rdi) -; CHECK-NEXT: setb %al -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: je .LBB52_1 +; CHECK-NEXT: jae .LBB52_1 ; CHECK-NEXT: # %bb.2: # %if.then ; CHECK-NEXT: movq 32(%rdi), %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll --- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll @@ -73,7 +73,7 @@ ; CHECK-NEXT: # %bb.4: # %.preheader ; CHECK-NEXT: shlq $6, %rcx ; CHECK-NEXT: addq %rcx, %rsi -; CHECK-NEXT: shlq $6, %rax +; CHECK-NEXT: shll $6, %eax ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -78,7 +78,7 @@ ; AVX-NEXT: # %bb.4: # %.preheader ; AVX-NEXT: shlq $4, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $4, %rax +; AVX-NEXT: shll $4, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 @@ -125,7 +125,7 @@ ; AVX512-NEXT: # %bb.4: # %.preheader ; AVX512-NEXT: shlq $4, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shlq $4, %rax +; AVX512-NEXT: shll $4, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 @@ -425,7 +425,7 @@ ; AVX-NEXT: # %bb.4: # %.preheader ; AVX-NEXT: shlq $5, %rcx ; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $5, %rax +; AVX-NEXT: shll $5, %eax ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: .p2align 4, 0x90 ; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 @@ -472,7 +472,7 @@ ; AVX512-NEXT: # %bb.4: # %.preheader ; AVX512-NEXT: shlq $5, %rcx ; AVX512-NEXT: addq %rcx, %rsi -; AVX512-NEXT: shlq $5, %rax +; AVX512-NEXT: shll $5, %eax ; AVX512-NEXT: xorl %ecx, %ecx ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -168,8 +168,8 @@ ; CHECK64-NEXT: movzwl var16(%rip), %eax ; CHECK64-NEXT: movl %eax, %ecx ; CHECK64-NEXT: shrl $8, %ecx -; CHECK64-NEXT: shlq $8, %rax -; CHECK64-NEXT: orq %rcx, %rax +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: orl %ecx, %eax ; CHECK64-NEXT: retq %init = load i16, ptr @var16 %big = zext i16 %init to i64 @@ -197,7 +197,7 @@ ; CHECK64-LABEL: not_useful_bswap: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movzbl var8(%rip), %eax -; CHECK64-NEXT: shlq $8, %rax +; CHECK64-NEXT: shll $8, %eax ; CHECK64-NEXT: retq %init = load i8, ptr @var8 %big = zext i8 %init to i64 @@ -224,12 +224,9 @@ ; ; CHECK64-LABEL: finally_useful_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzwl var16(%rip), %ecx -; CHECK64-NEXT: movzbl %cl, %eax -; CHECK64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx -; CHECK64-NEXT: shrl $8, %ecx -; CHECK64-NEXT: shlq $8, %rax -; CHECK64-NEXT: orq %rcx, %rax +; CHECK64-NEXT: movzwl var16(%rip), %eax +; CHECK64-NEXT: bswapl %eax +; CHECK64-NEXT: shrl $16, %eax ; CHECK64-NEXT: retq %init = load i16, ptr @var16 %big = zext i16 %init to i64 diff --git a/llvm/test/CodeGen/X86/bt.ll b/llvm/test/CodeGen/X86/bt.ll --- a/llvm/test/CodeGen/X86/bt.ll +++ b/llvm/test/CodeGen/X86/bt.ll @@ -1119,6 +1119,7 @@ ; X64-NEXT: btl %ecx, %edi ; X64-NEXT: jae .LBB30_2 ; X64-NEXT: # %bb.1: +; X64-NEXT: movl %eax, %eax ; X64-NEXT: orl %edx, (%rsi,%rax,4) ; X64-NEXT: .LBB30_2: ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -835,7 +835,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: movzbl 2(%rdi), %ecx -; CHECK-NEXT: shlq $16, %rcx +; CHECK-NEXT: shll $16, %ecx ; CHECK-NEXT: orq %rax, %rcx ; CHECK-NEXT: movq %rcx, (%rsi) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -35,8 +35,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 @@ -53,8 +53,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: movzwl %di, %eax ; CHECK-NEXT: movzwl %si, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rax, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %zx = zext i16 %x to i64 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -368,20 +368,20 @@ ; X64-NEXT: bswapq %rax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $235867919, %ecx # imm = 0xE0F0F0F -; X64-NEXT: shlq $4, %rcx +; X64-NEXT: shll $4, %ecx ; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orl %ecx, %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $590558003, %ecx # imm = 0x23333333 ; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: leaq (%rax,%rcx,4), %rax +; X64-NEXT: leal (%rax,%rcx,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 ; X64-NEXT: shrl %eax ; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X64-NEXT: leaq (%rax,%rcx,2), %rax +; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = shl i64 %1, 33 diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1933,8 +1933,7 @@ ; ; X64-LABEL: test_i64_2147483647_mask_shl_1: ; X64: # %bb.0: -; X64-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF -; X64-NEXT: leaq (%rdi,%rdi), %rax +; X64-NEXT: leal (%rdi,%rdi), %eax ; X64-NEXT: retq %t0 = and i64 %a0, 2147483647 %t1 = shl i64 %t0, 1 diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll --- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll +++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll @@ -96,7 +96,7 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: sarb $4, %dil ; CHECK-NEXT: movzbl %dil, %eax -; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: shll $4, %eax ; CHECK-NEXT: retq entry: %shr = ashr i8 %v, 4 @@ -109,9 +109,7 @@ ; CHECK-LABEL: fun8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movswl %di, %eax -; CHECK-NEXT: shrl $4, %eax -; CHECK-NEXT: movzwl %ax, %eax -; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: andl $1048560, %eax # imm = 0xFFFF0 ; CHECK-NEXT: retq entry: %shr = ashr i16 %v, 4 @@ -140,11 +138,12 @@ define i64 @fun10(i8 zeroext %v) { ; CHECK-LABEL: fun10: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrb $4, %dil -; CHECK-NEXT: movzbl %dil, %ecx -; CHECK-NEXT: movq %rcx, %rax -; CHECK-NEXT: shlq $4, %rax -; CHECK-NEXT: orq %rcx, %rax +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrb $4, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: andl $-16, %edi +; CHECK-NEXT: orq %rdi, %rax ; CHECK-NEXT: retq entry: %shr = lshr i8 %v, 4 @@ -158,9 +157,9 @@ ; CHECK-LABEL: fun11: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: shrl $4, %edi -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $4, %eax +; CHECK-NEXT: andl $-16, %edi ; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: retq entry: @@ -175,9 +174,9 @@ ; CHECK-LABEL: fun12: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: shrl $4, %edi -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shlq $4, %rax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: shrl $4, %eax +; CHECK-NEXT: andl $-16, %edi ; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -425,38 +425,41 @@ define i128 @urem_i128_12(i128 %x) nounwind { ; X86-64-LABEL: urem_i128_12: ; X86-64: # %bb.0: # %entry -; X86-64-NEXT: movq %rsi, %rax -; X86-64-NEXT: shldq $62, %rdi, %rax +; X86-64-NEXT: movq %rsi, %rcx +; X86-64-NEXT: shldq $62, %rdi, %rcx ; X86-64-NEXT: shrq $2, %rsi -; X86-64-NEXT: addq %rax, %rsi -; X86-64-NEXT: adcq $0, %rsi -; X86-64-NEXT: movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB -; X86-64-NEXT: movq %rsi, %rax -; X86-64-NEXT: mulq %rcx +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: adcq $0, %rcx +; X86-64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx ; X86-64-NEXT: shrq %rdx -; X86-64-NEXT: leaq (%rdx,%rdx,2), %rax -; X86-64-NEXT: subq %rax, %rsi +; X86-64-NEXT: leal (%rdx,%rdx,2), %eax +; X86-64-NEXT: subl %eax, %ecx +; X86-64-NEXT: shll $2, %ecx ; X86-64-NEXT: andl $3, %edi -; X86-64-NEXT: leaq (%rdi,%rsi,4), %rax +; X86-64-NEXT: orq %rdi, %rcx +; X86-64-NEXT: movq %rcx, %rax ; X86-64-NEXT: xorl %edx, %edx ; X86-64-NEXT: retq ; ; WIN64-LABEL: urem_i128_12: ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movq %rdx, %r8 -; WIN64-NEXT: movq %rdx, %rax -; WIN64-NEXT: shldq $62, %rcx, %rax -; WIN64-NEXT: shrq $2, %r8 -; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: shldq $62, %rcx, %r8 +; WIN64-NEXT: shrq $2, %rdx +; WIN64-NEXT: addq %rdx, %r8 ; WIN64-NEXT: adcq $0, %r8 ; WIN64-NEXT: movabsq $-6148914691236517205, %rdx # imm = 0xAAAAAAAAAAAAAAAB ; WIN64-NEXT: movq %r8, %rax ; WIN64-NEXT: mulq %rdx ; WIN64-NEXT: shrq %rdx -; WIN64-NEXT: leaq (%rdx,%rdx,2), %rax -; WIN64-NEXT: subq %rax, %r8 +; WIN64-NEXT: leal (%rdx,%rdx,2), %eax +; WIN64-NEXT: subl %eax, %r8d +; WIN64-NEXT: shll $2, %r8d ; WIN64-NEXT: andl $3, %ecx -; WIN64-NEXT: leaq (%rcx,%r8,4), %rax +; WIN64-NEXT: orq %rcx, %r8 +; WIN64-NEXT: movq %r8, %rax ; WIN64-NEXT: xorl %edx, %edx ; WIN64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -8092,26 +8092,13 @@ ; X86-BMITBM-NEXT: incl (%eax,%ecx,4) ; X86-BMITBM-NEXT: retl ; -; X64-NOBMI-LABEL: pr38938: -; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl (%rsi), %eax -; X64-NOBMI-NEXT: shrl $21, %eax -; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF -; X64-NOBMI-NEXT: incl (%rdi,%rax,4) -; X64-NOBMI-NEXT: retq -; -; X64-BMINOTBM-LABEL: pr38938: -; X64-BMINOTBM: # %bb.0: -; X64-BMINOTBM-NEXT: movl $2581, %eax # imm = 0xA15 -; X64-BMINOTBM-NEXT: bextrl %eax, (%rsi), %eax -; X64-BMINOTBM-NEXT: incl (%rdi,%rax,4) -; X64-BMINOTBM-NEXT: retq -; -; X64-BMITBM-LABEL: pr38938: -; X64-BMITBM: # %bb.0: -; X64-BMITBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15 -; X64-BMITBM-NEXT: incl (%rdi,%rax,4) -; X64-BMITBM-NEXT: retq +; X64-LABEL: pr38938: +; X64: # %bb.0: +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: shrl $19, %eax +; X64-NEXT: andl $4092, %eax # imm = 0xFFC +; X64-NEXT: incl (%rdi,%rax) +; X64-NEXT: retq %tmp = load i64, ptr %a1, align 8 %tmp1 = lshr i64 %tmp, 21 %tmp2 = and i64 %tmp1, 1023 @@ -8305,9 +8292,9 @@ ; ; X64-LABEL: c2_i64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $49, %rax -; X64-NEXT: andl $4092, %eax # imm = 0xFFC +; X64-NEXT: shrq $51, %rdi +; X64-NEXT: andl $1023, %edi # imm = 0x3FF +; X64-NEXT: leal (,%rdi,4), %eax ; X64-NEXT: retq %tmp0 = lshr i64 %arg, 51 %tmp1 = and i64 %tmp0, 1023 @@ -8592,8 +8579,9 @@ ; ; X64-LABEL: c7_i64: ; X64: # %bb.0: -; X64-NEXT: shrq $49, %rdi -; X64-NEXT: andl $4092, %edi # imm = 0xFFC +; X64-NEXT: shrq $51, %rdi +; X64-NEXT: andl $1023, %edi # imm = 0x3FF +; X64-NEXT: shll $2, %edi ; X64-NEXT: movq %rdi, (%rsi) ; X64-NEXT: retq %tmp0 = lshr i64 %arg, 51 diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -137,7 +137,7 @@ ; SSE-NEXT: xorl %ecx, %ecx ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sets %cl -; SSE-NEXT: shlq $4, %rcx +; SSE-NEXT: shll $4, %ecx ; SSE-NEXT: movaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0 ; SSE-NEXT: popq %rax ; SSE-NEXT: retq @@ -151,7 +151,7 @@ ; AVX-NEXT: xorl %ecx, %ecx ; AVX-NEXT: testl %eax, %eax ; AVX-NEXT: sets %cl -; AVX-NEXT: shlq $4, %rcx +; AVX-NEXT: shll $4, %ecx ; AVX-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}(%rcx), %xmm0 ; AVX-NEXT: popq %rax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lea-dagdag.ll b/llvm/test/CodeGen/X86/lea-dagdag.ll --- a/llvm/test/CodeGen/X86/lea-dagdag.ll +++ b/llvm/test/CodeGen/X86/lea-dagdag.ll @@ -199,7 +199,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: andl $8, %esi -; CHECK-NEXT: shlq $4, %rsi +; CHECK-NEXT: shll $4, %esi ; CHECK-NEXT: leaq (%rsi,%rdi), %rax ; CHECK-NEXT: retq %t4 = and i32 %t1, 8 diff --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll --- a/llvm/test/CodeGen/X86/lea-opt2.ll +++ b/llvm/test/CodeGen/X86/lea-opt2.ll @@ -192,7 +192,7 @@ ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: testl $4095, %eax # imm = 0xFFF ; CHECK-NEXT: setne %cl -; CHECK-NEXT: shlq $12, %rcx +; CHECK-NEXT: shll $12, %ecx ; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: andq $-4096, %rcx # imm = 0xF000 ; CHECK-NEXT: addq %rcx, %rdi diff --git a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll --- a/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll +++ b/llvm/test/CodeGen/X86/lsr-loop-exit-cond.ll @@ -27,33 +27,37 @@ ; GENERIC-NEXT: ## =>This Inner Loop Header: Depth=1 ; GENERIC-NEXT: movzbl %r8b, %r14d ; GENERIC-NEXT: ## kill: def $r8d killed $r8d def $r8 -; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl %ebx, %ebp -; GENERIC-NEXT: shrl $16, %ebp -; GENERIC-NEXT: movzbl %bpl, %r15d -; GENERIC-NEXT: movl (%rax,%r15,4), %ebp -; GENERIC-NEXT: xorl (%rdi,%r8,4), %ebp +; GENERIC-NEXT: shrl $22, %r8d +; GENERIC-NEXT: andl $-4, %r8d +; GENERIC-NEXT: movl %ebx, %r15d +; GENERIC-NEXT: shrl $14, %r15d +; GENERIC-NEXT: andl $1020, %r15d ## imm = 0x3FC +; GENERIC-NEXT: movl (%rax,%r15), %ebp +; GENERIC-NEXT: xorl (%rdi,%r8), %ebp ; GENERIC-NEXT: xorl -12(%r9), %ebp -; GENERIC-NEXT: shrl $24, %ebx +; GENERIC-NEXT: shrl $22, %ebx +; GENERIC-NEXT: andl $-4, %ebx ; GENERIC-NEXT: movl (%r10,%r14,4), %r14d -; GENERIC-NEXT: xorl (%rdi,%rbx,4), %r14d +; GENERIC-NEXT: xorl (%rdi,%rbx), %r14d ; GENERIC-NEXT: xorl -8(%r9), %r14d ; GENERIC-NEXT: movl %ebp, %r8d -; GENERIC-NEXT: shrl $24, %r8d -; GENERIC-NEXT: movl (%rdi,%r8,4), %r8d +; GENERIC-NEXT: shrl $22, %r8d +; GENERIC-NEXT: andl $-4, %r8d +; GENERIC-NEXT: movl (%rdi,%r8), %r8d ; GENERIC-NEXT: subq $1, %r11 ; GENERIC-NEXT: jb LBB0_3 ; GENERIC-NEXT: ## %bb.2: ## %bb1 ; GENERIC-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; GENERIC-NEXT: movl %r14d, %ebx -; GENERIC-NEXT: shrl $16, %ebx -; GENERIC-NEXT: movzbl %bl, %ebx -; GENERIC-NEXT: xorl (%rax,%rbx,4), %r8d +; GENERIC-NEXT: shrl $14, %ebx +; GENERIC-NEXT: andl $1020, %ebx ## imm = 0x3FC +; GENERIC-NEXT: xorl (%rax,%rbx), %r8d ; GENERIC-NEXT: xorl -4(%r9), %r8d -; GENERIC-NEXT: shrl $24, %r14d +; GENERIC-NEXT: shrl $22, %r14d +; GENERIC-NEXT: andl $-4, %r14d ; GENERIC-NEXT: movzbl %bpl, %ebx ; GENERIC-NEXT: movl (%r10,%rbx,4), %ebx -; GENERIC-NEXT: xorl (%rdi,%r14,4), %ebx +; GENERIC-NEXT: xorl (%rdi,%r14), %ebx ; GENERIC-NEXT: xorl (%r9), %ebx ; GENERIC-NEXT: addq $16, %r9 ; GENERIC-NEXT: jmp LBB0_1 @@ -61,14 +65,15 @@ ; GENERIC-NEXT: shlq $4, %rcx ; GENERIC-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 ; GENERIC-NEXT: movl %r14d, %r9d -; GENERIC-NEXT: shrl $16, %r9d -; GENERIC-NEXT: movzbl %r9b, %r9d -; GENERIC-NEXT: movzbl 2(%rax,%r9,4), %r9d +; GENERIC-NEXT: shrl $14, %r9d +; GENERIC-NEXT: andl $1020, %r9d ## imm = 0x3FC +; GENERIC-NEXT: movzbl 2(%rax,%r9), %r9d ; GENERIC-NEXT: shll $16, %r9d ; GENERIC-NEXT: orl %r8d, %r9d ; GENERIC-NEXT: xorl 16(%rcx,%rdx), %r9d -; GENERIC-NEXT: shrl $8, %r14d -; GENERIC-NEXT: movzbl 3(%rdi,%r14,4), %edi +; GENERIC-NEXT: shrl $6, %r14d +; GENERIC-NEXT: andl $-4, %r14d +; GENERIC-NEXT: movzbl 3(%rdi,%r14), %edi ; GENERIC-NEXT: shll $24, %edi ; GENERIC-NEXT: movzbl %bpl, %r8d ; GENERIC-NEXT: movzbl 2(%rax,%r8,4), %eax @@ -99,7 +104,7 @@ ; ATOM-NEXT: pushq %rbx ; ATOM-NEXT: ## kill: def $ecx killed $ecx def $rcx ; ATOM-NEXT: movl (%rdx), %r8d -; ATOM-NEXT: movl 4(%rdx), %r15d +; ATOM-NEXT: movl 4(%rdx), %ebx ; ATOM-NEXT: leaq 20(%rdx), %r9 ; ATOM-NEXT: movq _Te0@GOTPCREL(%rip), %rdi ; ATOM-NEXT: movq _Te1@GOTPCREL(%rip), %rax @@ -109,51 +114,56 @@ ; ATOM-NEXT: .p2align 4, 0x90 ; ATOM-NEXT: LBB0_1: ## %bb ; ATOM-NEXT: ## =>This Inner Loop Header: Depth=1 -; ATOM-NEXT: movl %r15d, %ebx -; ATOM-NEXT: movl %r8d, %r14d -; ATOM-NEXT: movzbl %r8b, %r8d -; ATOM-NEXT: shrl $24, %r15d -; ATOM-NEXT: shrl $16, %ebx -; ATOM-NEXT: shrl $24, %r14d -; ATOM-NEXT: movzbl %bl, %ebx -; ATOM-NEXT: movl (%rax,%rbx,4), %ebx -; ATOM-NEXT: xorl (%rdi,%r14,4), %ebx -; ATOM-NEXT: movl (%r10,%r8,4), %r14d -; ATOM-NEXT: xorl -12(%r9), %ebx -; ATOM-NEXT: xorl (%rdi,%r15,4), %r14d -; ATOM-NEXT: movl %ebx, %r8d +; ATOM-NEXT: movl %ebx, %r14d +; ATOM-NEXT: movzbl %r8b, %r15d +; ATOM-NEXT: ## kill: def $r8d killed $r8d def $r8 +; ATOM-NEXT: shrl $22, %ebx +; ATOM-NEXT: shrl $14, %r14d +; ATOM-NEXT: shrl $22, %r8d +; ATOM-NEXT: andl $-4, %ebx +; ATOM-NEXT: andl $1020, %r14d ## imm = 0x3FC +; ATOM-NEXT: andl $-4, %r8d +; ATOM-NEXT: movl (%rax,%r14), %ebp +; ATOM-NEXT: movl (%r10,%r15,4), %r14d +; ATOM-NEXT: xorl (%rdi,%r8), %ebp +; ATOM-NEXT: xorl (%rdi,%rbx), %r14d +; ATOM-NEXT: xorl -12(%r9), %ebp ; ATOM-NEXT: xorl -8(%r9), %r14d -; ATOM-NEXT: shrl $24, %r8d +; ATOM-NEXT: movl %ebp, %r8d +; ATOM-NEXT: shrl $22, %r8d +; ATOM-NEXT: andl $-4, %r8d ; ATOM-NEXT: subq $1, %r11 -; ATOM-NEXT: movl (%rdi,%r8,4), %r8d +; ATOM-NEXT: movl (%rdi,%r8), %r8d ; ATOM-NEXT: jb LBB0_3 ; ATOM-NEXT: ## %bb.2: ## %bb1 ; ATOM-NEXT: ## in Loop: Header=BB0_1 Depth=1 -; ATOM-NEXT: movl %r14d, %ebp -; ATOM-NEXT: movzbl %bl, %ebx -; ATOM-NEXT: shrl $24, %r14d -; ATOM-NEXT: shrl $16, %ebp -; ATOM-NEXT: movzbl %bpl, %r15d -; ATOM-NEXT: xorl (%rax,%r15,4), %r8d -; ATOM-NEXT: movl (%r10,%rbx,4), %r15d -; ATOM-NEXT: xorl (%rdi,%r14,4), %r15d +; ATOM-NEXT: movl %r14d, %ebx +; ATOM-NEXT: shrl $22, %r14d +; ATOM-NEXT: shrl $14, %ebx +; ATOM-NEXT: andl $-4, %r14d +; ATOM-NEXT: andl $1020, %ebx ## imm = 0x3FC +; ATOM-NEXT: xorl (%rax,%rbx), %r8d +; ATOM-NEXT: movzbl %bpl, %ebx +; ATOM-NEXT: movl (%r10,%rbx,4), %ebx ; ATOM-NEXT: xorl -4(%r9), %r8d -; ATOM-NEXT: xorl (%r9), %r15d +; ATOM-NEXT: xorl (%rdi,%r14), %ebx +; ATOM-NEXT: xorl (%r9), %ebx ; ATOM-NEXT: addq $16, %r9 ; ATOM-NEXT: jmp LBB0_1 ; ATOM-NEXT: LBB0_3: ## %bb2 ; ATOM-NEXT: movl %r14d, %r9d +; ATOM-NEXT: shrl $6, %r14d ; ATOM-NEXT: andl $-16777216, %r8d ## imm = 0xFF000000 -; ATOM-NEXT: shrl $8, %r14d ; ATOM-NEXT: shlq $4, %rcx -; ATOM-NEXT: shrl $16, %r9d -; ATOM-NEXT: movzbl 3(%rdi,%r14,4), %edi -; ATOM-NEXT: movzbl %r9b, %r9d +; ATOM-NEXT: shrl $14, %r9d +; ATOM-NEXT: andl $-4, %r14d +; ATOM-NEXT: andl $1020, %r9d ## imm = 0x3FC +; ATOM-NEXT: movzbl 3(%rdi,%r14), %edi +; ATOM-NEXT: movzbl 2(%rax,%r9), %r9d ; ATOM-NEXT: shll $24, %edi -; ATOM-NEXT: movzbl 2(%rax,%r9,4), %r9d ; ATOM-NEXT: shll $16, %r9d ; ATOM-NEXT: orl %r8d, %r9d -; ATOM-NEXT: movzbl %bl, %r8d +; ATOM-NEXT: movzbl %bpl, %r8d ; ATOM-NEXT: movzbl 2(%rax,%r8,4), %eax ; ATOM-NEXT: xorl 16(%rcx,%rdx), %r9d ; ATOM-NEXT: shll $16, %eax diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll --- a/llvm/test/CodeGen/X86/masked_compressstore.ll +++ b/llvm/test/CodeGen/X86/masked_compressstore.ll @@ -531,9 +531,10 @@ ; AVX512F-NEXT: addl %ecx, %eax ; AVX512F-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F ; AVX512F-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 -; AVX512F-NEXT: shrl $24, %eax +; AVX512F-NEXT: shrl $21, %eax +; AVX512F-NEXT: andl $-8, %eax ; AVX512F-NEXT: kshiftrw $8, %k1, %k2 -; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} +; AVX512F-NEXT: vcompresspd %zmm1, (%rdi,%rax) {%k2} ; AVX512F-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -558,9 +559,10 @@ ; AVX512VLDQ-NEXT: addl %eax, %ecx ; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512VLDQ-NEXT: shrl $24, %eax +; AVX512VLDQ-NEXT: shrl $21, %eax +; AVX512VLDQ-NEXT: andl $-8, %eax ; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2 -; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} +; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax) {%k2} ; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: vzeroupper ; AVX512VLDQ-NEXT: retq @@ -584,9 +586,10 @@ ; AVX512VLBW-NEXT: addl %ecx, %eax ; AVX512VLBW-NEXT: andl $252645135, %eax ## imm = 0xF0F0F0F ; AVX512VLBW-NEXT: imull $16843009, %eax, %eax ## imm = 0x1010101 -; AVX512VLBW-NEXT: shrl $24, %eax +; AVX512VLBW-NEXT: shrl $21, %eax +; AVX512VLBW-NEXT: andl $-8, %eax ; AVX512VLBW-NEXT: kshiftrw $8, %k1, %k2 -; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2} +; AVX512VLBW-NEXT: vcompresspd %zmm1, (%rdi,%rax) {%k2} ; AVX512VLBW-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq @@ -2444,8 +2447,9 @@ ; AVX512-NEXT: addl %eax, %ecx ; AVX512-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512-NEXT: shrl $24, %eax -; AVX512-NEXT: vcompressps %zmm1, (%rdi,%rax,4) {%k1} +; AVX512-NEXT: shrl $22, %eax +; AVX512-NEXT: andl $-4, %eax +; AVX512-NEXT: vcompressps %zmm1, (%rdi,%rax) {%k1} ; AVX512-NEXT: vcompressps %zmm0, (%rdi) {%k2} ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1023,8 +1023,9 @@ ; AVX512F-NEXT: addl %eax, %ecx ; AVX512F-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512F-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512F-NEXT: shrl $24, %eax -; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512F-NEXT: shrl $21, %eax +; AVX512F-NEXT: andl $-8, %eax +; AVX512F-NEXT: vexpandpd (%rdi,%rax), %zmm1 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VLDQ-LABEL: expandload_v16f64_v16i32: @@ -1047,8 +1048,9 @@ ; AVX512VLDQ-NEXT: addl %eax, %ecx ; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512VLDQ-NEXT: shrl $24, %eax -; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512VLDQ-NEXT: shrl $21, %eax +; AVX512VLDQ-NEXT: andl $-8, %eax +; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax), %zmm1 {%k1} ; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2} ; AVX512VLDQ-NEXT: retq ; @@ -1074,8 +1076,9 @@ ; AVX512VLBW-NEXT: addl %eax, %ecx ; AVX512VLBW-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512VLBW-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512VLBW-NEXT: shrl $24, %eax -; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1} +; AVX512VLBW-NEXT: shrl $21, %eax +; AVX512VLBW-NEXT: andl $-8, %eax +; AVX512VLBW-NEXT: vexpandpd (%rdi,%rax), %zmm1 {%k1} ; AVX512VLBW-NEXT: retq %mask = icmp eq <16 x i32> %trigger, zeroinitializer %res = call <16 x double> @llvm.masked.expandload.v16f64(ptr %base, <16 x i1> %mask, <16 x double> %src0) @@ -2652,8 +2655,9 @@ ; AVX512-NEXT: addl %eax, %ecx ; AVX512-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F ; AVX512-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101 -; AVX512-NEXT: shrl $24, %eax -; AVX512-NEXT: vexpandps (%rdi,%rax,4), %zmm1 {%k2} +; AVX512-NEXT: shrl $22, %eax +; AVX512-NEXT: andl $-4, %eax +; AVX512-NEXT: vexpandps (%rdi,%rax), %zmm1 {%k2} ; AVX512-NEXT: vexpandps (%rdi), %zmm0 {%k1} ; AVX512-NEXT: retq %mask = icmp eq <32 x i32> %trigger, zeroinitializer diff --git a/llvm/test/CodeGen/X86/or-address.ll b/llvm/test/CodeGen/X86/or-address.ll --- a/llvm/test/CodeGen/X86/or-address.ll +++ b/llvm/test/CodeGen/X86/or-address.ll @@ -24,10 +24,14 @@ ; CHECK-NEXT: addb %r8b, %r9b ; CHECK-NEXT: shlb $2, %r9b ; CHECK-NEXT: movzbl %r9b, %r9d -; CHECK-NEXT: movl %esi, (%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 8(%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 4(%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 12(%rdi,%r9,4) +; CHECK-NEXT: leal (,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 8(,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 4(,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 12(,%r9,4), %r9d +; CHECK-NEXT: movl %esi, (%rdi,%r9) ; CHECK-NEXT: incb %r8b ; CHECK-NEXT: decb %al ; CHECK-NEXT: jne LBB0_1 @@ -88,10 +92,14 @@ ; CHECK-NEXT: shlb $4, %r10b ; CHECK-NEXT: addb %r9b, %r10b ; CHECK-NEXT: movzbl %r10b, %r9d -; CHECK-NEXT: movl %esi, (%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 8(%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 4(%rdi,%r9,4) -; CHECK-NEXT: movl %esi, 12(%rdi,%r9,4) +; CHECK-NEXT: leal (,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 8(,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 4(,%r9,4), %r10d +; CHECK-NEXT: movl %esi, (%rdi,%r10) +; CHECK-NEXT: leal 12(,%r9,4), %r9d +; CHECK-NEXT: movl %esi, (%rdi,%r9) ; CHECK-NEXT: incb %cl ; CHECK-NEXT: decb %al ; CHECK-NEXT: jne LBB1_1 diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -637,7 +637,7 @@ ; X64-NOPOPCNT-NEXT: xorl %eax, %eax ; X64-NOPOPCNT-NEXT: xorb %ch, %cl ; X64-NOPOPCNT-NEXT: setnp %al -; X64-NOPOPCNT-NEXT: addq %rax, %rax +; X64-NOPOPCNT-NEXT: addl %eax, %eax ; X64-NOPOPCNT-NEXT: retq ; ; X86-POPCNT-LABEL: parity_64_shift: @@ -654,7 +654,7 @@ ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntq %rdi, %rax ; X64-POPCNT-NEXT: andl $1, %eax -; X64-POPCNT-NEXT: addq %rax, %rax +; X64-POPCNT-NEXT: addl %eax, %eax ; X64-POPCNT-NEXT: retq %2 = tail call i64 @llvm.ctpop.i64(i64 %0) %3 = shl nuw nsw i64 %2, 1 diff --git a/llvm/test/CodeGen/X86/pr22970.ll b/llvm/test/CodeGen/X86/pr22970.ll --- a/llvm/test/CodeGen/X86/pr22970.ll +++ b/llvm/test/CodeGen/X86/pr22970.ll @@ -15,7 +15,8 @@ ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: andl $4095, %esi # imm = 0xFFF -; X64-NEXT: movl 32(%rdi,%rsi,4), %eax +; X64-NEXT: leal 32(,%rsi,4), %eax +; X64-NEXT: movl (%rdi,%rax), %eax ; X64-NEXT: retq %3 = and i32 %1, 4095 %4 = add nuw nsw i32 %3, 8 @@ -37,7 +38,8 @@ ; X64-LABEL: PR22970_i64: ; X64: # %bb.0: ; X64-NEXT: andl $4095, %esi # imm = 0xFFF -; X64-NEXT: movl 32(%rdi,%rsi,4), %eax +; X64-NEXT: leal 32(,%rsi,4), %eax +; X64-NEXT: movl (%rdi,%rax), %eax ; X64-NEXT: retq %3 = and i64 %1, 4095 %4 = add nuw nsw i64 %3, 8 diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll --- a/llvm/test/CodeGen/X86/pr38217.ll +++ b/llvm/test/CodeGen/X86/pr38217.ll @@ -24,12 +24,14 @@ ; CHECK-NEXT: shrq $32, %rax ; CHECK-NEXT: imull $100, %eax, %r10d ; CHECK-NEXT: subl %r10d, %r9d +; CHECK-NEXT: addl %r9d, %r9d +; CHECK-NEXT: addl %eax, %eax ; CHECK-NEXT: movl %ecx, %r10d ; CHECK-NEXT: movq %rsi, %r11 ; CHECK-NEXT: subq %r10, %r11 -; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%r9,%r9), %r9d +; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%r9), %r9d ; CHECK-NEXT: movw %r9w, -1(%r11) -; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rax,%rax), %eax +; CHECK-NEXT: movzwl _ZL11DIGIT_TABLE(%rax), %eax ; CHECK-NEXT: movw %ax, -3(%r11) ; CHECK-NEXT: addl $4, %ecx ; CHECK-NEXT: cmpq $99999999, %rdi # imm = 0x5F5E0FF diff --git a/llvm/test/CodeGen/X86/pr62653.ll b/llvm/test/CodeGen/X86/pr62653.ll --- a/llvm/test/CodeGen/X86/pr62653.ll +++ b/llvm/test/CodeGen/X86/pr62653.ll @@ -4,124 +4,117 @@ define <64 x i4> @pr62653(<64 x i4> %a0) nounwind { ; CHECK-LABEL: pr62653: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $r9d killed $r9d def $r9 -; CHECK-NEXT: # kill: def $r8d killed $r8d def $r8 -; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx -; CHECK-NEXT: # kill: def $edx killed $edx def $rdx -; CHECK-NEXT: # kill: def $esi killed $esi def $rsi ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shll $4, %edi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $4, %r10 ; CHECK-NEXT: orq %rdi, %r10 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: shlq $8, %rdi +; CHECK-NEXT: shll $8, %edi ; CHECK-NEXT: orq %r10, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $12, %r10 +; CHECK-NEXT: shll $12, %r10d ; CHECK-NEXT: orq %rdi, %r10 -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $16, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi -; CHECK-NEXT: shlq $20, %rdi -; CHECK-NEXT: orq %r11, %rdi +; CHECK-NEXT: shll $16, %edi +; CHECK-NEXT: orq %r10, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $24, %r10 +; CHECK-NEXT: shll $20, %r10d ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $28, %r11 +; CHECK-NEXT: shll $24, %r11d ; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $32, %r10 +; CHECK-NEXT: shll $28, %r10d ; CHECK-NEXT: orq %r11, %r10 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d -; CHECK-NEXT: shlq $36, %r11 +; CHECK-NEXT: shlq $32, %r11 ; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $40, %r10 +; CHECK-NEXT: shlq $36, %r10 ; CHECK-NEXT: orq %r11, %r10 +; CHECK-NEXT: orq %rdi, %r10 +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $40, %rdi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d ; CHECK-NEXT: andl $15, %r11d ; CHECK-NEXT: shlq $44, %r11 -; CHECK-NEXT: orq %r10, %r11 ; CHECK-NEXT: orq %rdi, %r11 ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-NEXT: andl $15, %edi ; CHECK-NEXT: shlq $48, %rdi -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; CHECK-NEXT: andl $15, %r10d -; CHECK-NEXT: shlq $52, %r10 -; CHECK-NEXT: orq %rdi, %r10 -; CHECK-NEXT: orq %r11, %r10 -; CHECK-NEXT: movq %r10, 8(%rax) +; CHECK-NEXT: orq %r11, %rdi +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d +; CHECK-NEXT: andl $15, %r11d +; CHECK-NEXT: shlq $52, %r11 +; CHECK-NEXT: orq %rdi, %r11 +; CHECK-NEXT: orq %r10, %r11 +; CHECK-NEXT: movq %r11, 8(%rax) +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: andl $15, %edi +; CHECK-NEXT: shlq $32, %rdi ; CHECK-NEXT: andl $15, %esi ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $4, %rdx -; CHECK-NEXT: orq %rsi, %rdx +; CHECK-NEXT: shll $4, %edx +; CHECK-NEXT: orl %esi, %edx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $8, %rcx -; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: orl %edx, %ecx ; CHECK-NEXT: andl $15, %r8d -; CHECK-NEXT: shlq $12, %r8 -; CHECK-NEXT: orq %rcx, %r8 +; CHECK-NEXT: shll $12, %r8d +; CHECK-NEXT: orl %ecx, %r8d ; CHECK-NEXT: andl $15, %r9d -; CHECK-NEXT: shlq $16, %r9 -; CHECK-NEXT: orq %r8, %r9 +; CHECK-NEXT: shll $16, %r9d +; CHECK-NEXT: orl %r8d, %r9d ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $20, %rcx -; CHECK-NEXT: orq %r9, %rcx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $24, %rsi +; CHECK-NEXT: shll $20, %ecx +; CHECK-NEXT: orl %r9d, %ecx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $28, %rdx -; CHECK-NEXT: orq %rsi, %rdx -; CHECK-NEXT: orq %rcx, %rdx -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $32, %rcx +; CHECK-NEXT: shll $24, %edx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $36, %rsi -; CHECK-NEXT: orq %rcx, %rsi +; CHECK-NEXT: shll $28, %esi +; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: orl %ecx, %esi +; CHECK-NEXT: orq %rdi, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: andl $15, %ecx -; CHECK-NEXT: shlq $40, %rcx +; CHECK-NEXT: shlq $36, %rcx ; CHECK-NEXT: orq %rsi, %rcx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: shlq $40, %rdx +; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shlq $44, %rcx ; CHECK-NEXT: orq %rdx, %rcx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $44, %rdx +; CHECK-NEXT: shlq $48, %rdx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi ; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $48, %rsi +; CHECK-NEXT: shlq $52, %rsi ; CHECK-NEXT: orq %rdx, %rsi ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %edx ; CHECK-NEXT: andl $15, %edx -; CHECK-NEXT: shlq $52, %rdx +; CHECK-NEXT: shlq $56, %rdx ; CHECK-NEXT: orq %rsi, %rdx ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: andl $15, %esi -; CHECK-NEXT: shlq $56, %rsi +; CHECK-NEXT: shlq $60, %rsi ; CHECK-NEXT: orq %rdx, %rsi ; CHECK-NEXT: orq %rcx, %rsi -; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: shlq $60, %rcx -; CHECK-NEXT: orq %rsi, %rcx -; CHECK-NEXT: movq %rcx, (%rax) +; CHECK-NEXT: movq %rsi, (%rax) ; CHECK-NEXT: retq %res = shufflevector <64 x i4> %a0, <64 x i4> zeroinitializer, <64 x i32> ret <64 x i4> %res diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll --- a/llvm/test/CodeGen/X86/select.ll +++ b/llvm/test/CodeGen/X86/select.ll @@ -393,22 +393,22 @@ define x86_fp80 @test7(i32 %tmp8) nounwind { ; GENERIC-LABEL: test7: ; GENERIC: ## %bb.0: -; GENERIC-NEXT: xorl %eax, %eax -; GENERIC-NEXT: testl %edi, %edi -; GENERIC-NEXT: setns %al -; GENERIC-NEXT: shlq $4, %rax -; GENERIC-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; GENERIC-NEXT: fldt (%rax,%rcx) +; GENERIC-NEXT: ## kill: def $edi killed $edi def $rdi +; GENERIC-NEXT: notl %edi +; GENERIC-NEXT: shrl $27, %edi +; GENERIC-NEXT: andl $-16, %edi +; GENERIC-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; GENERIC-NEXT: fldt (%rdi,%rax) ; GENERIC-NEXT: retq ; ; ATOM-LABEL: test7: ; ATOM: ## %bb.0: -; ATOM-NEXT: xorl %eax, %eax -; ATOM-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx -; ATOM-NEXT: testl %edi, %edi -; ATOM-NEXT: setns %al -; ATOM-NEXT: shlq $4, %rax -; ATOM-NEXT: fldt (%rax,%rcx) +; ATOM-NEXT: ## kill: def $edi killed $edi def $rdi +; ATOM-NEXT: leaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax +; ATOM-NEXT: notl %edi +; ATOM-NEXT: shrl $27, %edi +; ATOM-NEXT: andl $-16, %edi +; ATOM-NEXT: fldt (%rdi,%rax) ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test7: diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll --- a/llvm/test/CodeGen/X86/select_const.ll +++ b/llvm/test/CodeGen/X86/select_const.ll @@ -628,7 +628,7 @@ ; X64: # %bb.0: ; X64-NEXT: xorb $1, %dil ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $7, %rax +; X64-NEXT: shll $7, %eax ; X64-NEXT: addq $-99, %rax ; X64-NEXT: retq %sel = select i1 %cond, i64 -99, i64 29 diff --git a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll --- a/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll +++ b/llvm/test/CodeGen/X86/selectcc-to-shiftand.ll @@ -194,7 +194,7 @@ ; ANY: # %bb.0: ; ANY-NEXT: movl %edi, %eax ; ANY-NEXT: andl $1, %eax -; ANY-NEXT: shlq $16, %rax +; ANY-NEXT: shll $16, %eax ; ANY-NEXT: retq %shl = select i1 %t, i64 65536, i64 0 ret i64 %shl diff --git a/llvm/test/CodeGen/X86/setcc.ll b/llvm/test/CodeGen/X86/setcc.ll --- a/llvm/test/CodeGen/X86/setcc.ll +++ b/llvm/test/CodeGen/X86/setcc.ll @@ -64,7 +64,7 @@ ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: cmpq $18, %rdi ; X64-NEXT: setb %al -; X64-NEXT: shlq $6, %rax +; X64-NEXT: shll $6, %eax ; X64-NEXT: retq %t0 = icmp ult i64 %x, 18 %if = select i1 %t0, i64 64, i64 0 diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -15,9 +15,8 @@ ; X64-LABEL: test_lshr_and: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: shrl $2, %edi -; X64-NEXT: andl $3, %edi -; X64-NEXT: movl array(,%rdi,4), %eax +; X64-NEXT: andl $12, %edi +; X64-NEXT: movl array(%rdi), %eax ; X64-NEXT: retq %tmp2 = lshr i32 %x, 2 %tmp3 = and i32 %tmp2, 3 @@ -104,8 +103,8 @@ ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: subl %edi, %esi -; X64-NEXT: shrl $3, %esi -; X64-NEXT: leaq (%rdx,%rsi,4), %rax +; X64-NEXT: shrl %esi +; X64-NEXT: leaq (%rsi,%rdx), %rax ; X64-NEXT: retq %sub = sub i32 %b, %a %shr = lshr exact i32 %sub, 3 @@ -126,8 +125,8 @@ ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: subl %edi, %esi -; X64-NEXT: shrl $3, %esi -; X64-NEXT: leaq (%rdx,%rsi,4), %rax +; X64-NEXT: shrl %esi +; X64-NEXT: leaq (%rsi,%rdx), %rax ; X64-NEXT: retq %sub = sub i32 %b, %a %shr = lshr exact i32 %sub, 3 diff --git a/llvm/test/CodeGen/X86/shift-pair.ll b/llvm/test/CodeGen/X86/shift-pair.ll --- a/llvm/test/CodeGen/X86/shift-pair.ll +++ b/llvm/test/CodeGen/X86/shift-pair.ll @@ -4,9 +4,8 @@ define i64 @test(i64 %A) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: shrq $54, %rax -; CHECK-NEXT: andl $-4, %eax +; CHECK-NEXT: shrq $56, %rdi +; CHECK-NEXT: leal (,%rdi,4), %eax ; CHECK-NEXT: retq %B = lshr i64 %A, 56 %C = shl i64 %B, 2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -255,28 +255,28 @@ ; SSE2-NEXT: andl $7, %r8d ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: andl $7, %r9d -; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSE2-NEXT: movd %r10d, %xmm0 ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; @@ -299,28 +299,28 @@ ; SSSE3-NEXT: andl $7, %r8d ; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSSE3-NEXT: andl $7, %r9d -; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %r10d +; SSSE3-NEXT: movd %r10d, %xmm0 ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSSE3-NEXT: movd %eax, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; SSSE3-NEXT: movd %eax, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -293,52 +293,52 @@ ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi -; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: vmovaps %ymm0, (%rsp) -; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: andl $15, %esi -; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %ecx -; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %r8d -; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $15, %r9d -; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0 -; AVX1-NEXT: movl 16(%rbp), %eax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX1-NEXT: movl 24(%rbp), %eax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 32(%rbp), %eax ; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vmovaps %ymm0, (%rsp) ; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: movl 40(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 48(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 56(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 64(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 72(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 80(%rbp), %eax ; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl 88(%rbp), %eax ; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r8d +; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %r9d +; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 16(%rbp), %eax +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: movl 24(%rbp), %eax +; AVX1-NEXT: andl $15, %eax ; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq %rbp, %rsp ; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq @@ -355,52 +355,52 @@ ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: vmovaps %ymm0, (%rsp) -; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: andl $15, %esi -; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %r8d -; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $15, %r9d -; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0 -; AVX2-NEXT: movl 16(%rbp), %eax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 -; AVX2-NEXT: movl 24(%rbp), %eax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 32(%rbp), %eax ; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vmovaps %ymm0, (%rsp) ; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: movl 40(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 48(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 56(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 64(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 72(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 80(%rbp), %eax ; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl 88(%rbp), %eax ; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r8d +; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %r9d +; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 16(%rbp), %eax +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: movl 24(%rbp), %eax +; AVX2-NEXT: andl $15, %eax ; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: movq %rbp, %rsp ; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq @@ -448,52 +448,52 @@ ; AVX1-NEXT: # kill: def $edx killed $edx def $rdx ; AVX1-NEXT: # kill: def $esi killed $esi def $rsi ; AVX1-NEXT: # kill: def $edi killed $edi def $rdi -; AVX1-NEXT: andl $7, %edi -; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: andl $7, %esi -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %edx -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %ecx -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %r8d -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX1-NEXT: andl $7, %r9d -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax -; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX1-NEXT: andl $7, %edi +; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX1-NEXT: vmovd %eax, %xmm1 +; AVX1-NEXT: andl $7, %esi +; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %edx +; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %ecx +; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r8d +; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX1-NEXT: andl $7, %r9d +; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1 ; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX1-NEXT: andl $7, %eax ; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16: @@ -504,52 +504,52 @@ ; AVX2-NEXT: # kill: def $edx killed $edx def $rdx ; AVX2-NEXT: # kill: def $esi killed $esi def $rsi ; AVX2-NEXT: # kill: def $edi killed $edi def $rdi -; AVX2-NEXT: andl $7, %edi -; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: andl $7, %esi -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %edx -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %ecx -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %r8d -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0 -; AVX2-NEXT: andl $7, %r9d -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax -; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1 +; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0 +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: andl $7, %esi +; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %edx +; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %ecx +; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1 +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1 ; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; AVX2-NEXT: andl $7, %eax ; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %x0 = extractelement <8 x i16> %x, i32 %i0 %x1 = extractelement <8 x i16> %x, i32 %i1 @@ -597,13 +597,13 @@ ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: andq $-32, %rsp ; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 8(%rdi), %ecx ; ALL-NEXT: andl $3, %eax ; ALL-NEXT: andl $3, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx +; ALL-NEXT: movl 16(%rdi), %edx ; ALL-NEXT: andl $3, %edx -; ALL-NEXT: movq 24(%rdi), %rsi +; ALL-NEXT: movl 24(%rdi), %esi ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -637,13 +637,13 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, ptr %i) nounwind { ; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64: ; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq 8(%rdi), %rcx +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: movl 8(%rdi), %ecx ; ALL-NEXT: andl $1, %eax ; ALL-NEXT: andl $1, %ecx -; ALL-NEXT: movq 16(%rdi), %rdx +; ALL-NEXT: movl 16(%rdi), %edx ; ALL-NEXT: andl $1, %edx -; ALL-NEXT: movq 24(%rdi), %rsi +; ALL-NEXT: movl 24(%rdi), %esi ; ALL-NEXT: andl $1, %esi ; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll --- a/llvm/test/CodeGen/X86/vselect.ll +++ b/llvm/test/CodeGen/X86/vselect.ll @@ -651,18 +651,18 @@ ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: andl $1, %eax -; SSE-NEXT: shlq $15, %rax +; SSE-NEXT: shll $15, %eax ; SSE-NEXT: retq ; ; AVX-LABEL: vselect_any_extend_vector_inreg_crash: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax +; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax -; AVX-NEXT: shlq $15, %rax +; AVX-NEXT: shll $15, %eax ; AVX-NEXT: retq 0: %1 = load <8 x i8>, ptr %x diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll --- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll +++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll @@ -14,7 +14,7 @@ ; ; X64-LABEL: test1: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: shll $2, %eax ; X64-NEXT: andl $60, %eax ; X64-NEXT: retq @@ -37,7 +37,7 @@ ; ; X64-LABEL: test2: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: movzbl (%rdi), %eax ; X64-NEXT: andl $15, %eax ; X64-NEXT: leaq (%rdi,%rax,4), %rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/zext-shl.ll b/llvm/test/CodeGen/X86/zext-shl.ll --- a/llvm/test/CodeGen/X86/zext-shl.ll +++ b/llvm/test/CodeGen/X86/zext-shl.ll @@ -51,7 +51,7 @@ ; X64-LABEL: i64_zext_shift_i16_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $5, %rax +; X64-NEXT: shll $5, %eax ; X64-NEXT: retq %t0 = zext i8 %a0 to i16 %t1 = shl i16 %t0, 5 @@ -112,7 +112,7 @@ ; X64-LABEL: i128_zext_shift_i64_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shlq $4, %rax +; X64-NEXT: shll $4, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i8 %a0 to i64 @@ -136,7 +136,7 @@ ; X64-LABEL: i128_zext_shift_i64_zext_i16: ; X64: # %bb.0: ; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shlq $7, %rax +; X64-NEXT: shll $7, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i16 %a0 to i64