diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6152,55 +6152,6 @@ } } - // Reduce bit extract of low half of an integer to the narrower type. - // (and (srl i64:x, K), KMask) -> - // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask) - if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { - if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) { - if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { - unsigned Size = VT.getSizeInBits(); - const APInt &AndMask = CAnd->getAPIntValue(); - unsigned ShiftBits = CShift->getZExtValue(); - - // Bail out, this node will probably disappear anyway. - if (ShiftBits == 0) - return SDValue(); - - unsigned MaskBits = AndMask.countr_one(); - EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); - - if (AndMask.isMask() && - // Required bits must not span the two halves of the integer and - // must fit in the half size type. - (ShiftBits + MaskBits <= Size / 2) && - TLI.isNarrowingProfitable(VT, HalfVT) && - TLI.isTypeDesirableForOp(ISD::AND, HalfVT) && - TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) && - TLI.isTruncateFree(VT, HalfVT) && - TLI.isZExtFree(HalfVT, VT)) { - // The isNarrowingProfitable is to avoid regressions on PPC and - // AArch64 which match a few 64-bit bit insert / bit extract patterns - // on downstream users of this. Those patterns could probably be - // extended to handle extensions mixed in. - - SDValue SL(N0); - assert(MaskBits <= Size); - - // Extracting the highest bit of the low half. - EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); - SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT, - N0.getOperand(0)); - - SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT); - SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT); - SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK); - SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask); - return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And); - } - } - } - } - return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1863,6 +1863,27 @@ if (Op->getFlags().hasExact()) InDemandedMask.setLowBits(ShAmt); + // Narrow shift to lower half - similar to ShrinkDemandedOp. + // (srl i64:x, K) -> (i64 zero_extend (srl (i32 (trunc i64:x)), K)) + if ((BitWidth % 2) == 0 && !VT.isVector() && + ((InDemandedMask.countLeadingZeros() >= (BitWidth / 2)) || + TLO.DAG.MaskedValueIsZero( + Op0, APInt::getHighBitsSet(BitWidth, BitWidth / 2)))) { + EVT HalfVT = EVT::getIntegerVT(*TLO.DAG.getContext(), BitWidth / 2); + if (isNarrowingProfitable(VT, HalfVT) && + isTypeDesirableForOp(ISD::SRL, HalfVT) && + isTruncateFree(VT, HalfVT) && isZExtFree(HalfVT, VT) && + (!TLO.LegalOperations() || isOperationLegal(ISD::SRL, VT))) { + SDValue NewOp = TLO.DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Op0); + SDValue NewShiftAmt = TLO.DAG.getShiftAmountConstant( + ShAmt, HalfVT, dl, TLO.LegalTypes()); + SDValue NewShift = + TLO.DAG.getNode(ISD::SRL, dl, HalfVT, NewOp, NewShiftAmt); + return TLO.CombineTo( + Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, VT, NewShift)); + } + } + // Compute the new bits that are at the top now.
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, Depth + 1)) diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -963,21 +963,19 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 ; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2 ; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 -; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 -; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0 ; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_alignbit_b32 v2, 0, v2, 16 -; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 16 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1850,28 +1850,24 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v3, 0xff00, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v6, 0xff00, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v2 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX7-NEXT: v_alignbit_b32 v2, v4, v2, 16 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_alignbit_b32 v0, v7, v0, 16 -; GFX7-NEXT: v_alignbit_b32 v3, 0, v3, 16 -; GFX7-NEXT: v_alignbit_b32 v6, 0, v6, 16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v0 +; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 +; GFX7-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v5, v4, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX7-NEXT: v_mad_u32_u24 v1, v5, v7, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v4, v3, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v7, v8, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v6, v5, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2014,48 +2014,48 @@ ; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_bfe_i32 v8, v2, 0, 4 -; GFX7-NEXT: v_bfe_i32 v7, v2, 4, 4 -; 
GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v15, v0, 0, 4 -; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX7-NEXT: v_bfe_i32 v14, v0, 4, 4 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX7-NEXT: v_bfe_i32 v6, v2, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX7-NEXT: v_bfe_i32 v13, v0, 8, 4 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 +; GFX7-NEXT: v_bfe_i32 v6, v2, 0, 4 ; GFX7-NEXT: v_bfe_i32 v3, v2, 24, 4 -; GFX7-NEXT: v_bfe_i32 v4, v2, 20, 4 -; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v9, 28, v2 -; GFX7-NEXT: v_bfe_i32 v2, v2, 12, 4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_bfe_i32 v13, v0, 0, 4 +; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 4 +; GFX7-NEXT: v_bfe_i32 v5, v2, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 28, v2 +; GFX7-NEXT: v_bfe_i32 v8, v2, 20, 4 +; GFX7-NEXT: v_bfe_i32 v9, v2, 12, 4 +; GFX7-NEXT: v_bfe_i32 v2, v2, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX7-NEXT: v_bfe_i32 v10, v0, 24, 4 -; GFX7-NEXT: v_bfe_i32 v11, v0, 20, 4 -; GFX7-NEXT: v_bfe_i32 v12, v0, 16, 4 -; GFX7-NEXT: v_ashrrev_i32_e32 v16, 28, v0 -; GFX7-NEXT: v_bfe_i32 v0, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v11, v0, 16, 4 +; GFX7-NEXT: v_bfe_i32 v12, v0, 8, 4 +; GFX7-NEXT: v_ashrrev_i32_e32 v14, 28, v0 +; GFX7-NEXT: v_bfe_i32 v15, v0, 20, 4 +; GFX7-NEXT: v_bfe_i32 v16, v0, 12, 4 +; GFX7-NEXT: v_bfe_i32 v0, v0, 4, 4 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v6, v13, v1 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX7-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; GFX7-NEXT: v_mad_u32_u24 v0, v3, v10, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -2581,12 +2581,10 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 -; GFX7-NEXT: v_alignbit_b32 v9, 0, v9, 24 -; GFX7-NEXT: v_alignbit_b32 v16, 0, v16, 24 +; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX7-NEXT: v_and_b32_e32 v16, 0xff, v16 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX7-NEXT: v_and_b32_e32 v12, 0xff, v12 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ 
b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2444,32 +2444,28 @@ ; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 ; GFX7-NEXT: s_addc_u32 s13, s13, 0 ; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_and_b32_e32 v9, 15, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 28, v2 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_and_b32_e32 v16, 15, v0 ; GFX7-NEXT: v_bfe_u32 v4, v2, 24, 4 ; GFX7-NEXT: v_bfe_u32 v5, v2, 20, 4 ; GFX7-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX7-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX7-NEXT: v_bfe_u32 v8, v2, 4, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; GFX7-NEXT: v_bfe_u32 v7, v2, 12, 4 +; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX7-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX7-NEXT: s_waitcnt vmcnt(1) ; GFX7-NEXT: v_lshrrev_b32_e32 v10, 28, v0 ; GFX7-NEXT: v_bfe_u32 v11, v0, 24, 4 ; GFX7-NEXT: v_bfe_u32 v12, v0, 20, 4 ; GFX7-NEXT: v_bfe_u32 v13, v0, 16, 4 -; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX7-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 12, v0 +; GFX7-NEXT: v_bfe_u32 v14, v0, 12, 4 +; GFX7-NEXT: v_bfe_u32 v15, v0, 8, 4 +; GFX7-NEXT: v_bfe_u32 v16, v0, 4, 4 +; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v1, v9, v16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xf000000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xf000000, v0 -; GFX7-NEXT: v_mad_u32_u24 v1, v8, v15, v1 -; GFX7-NEXT: v_alignbit_b32 v2, s10, v2, 24 -; GFX7-NEXT: v_alignbit_b32 v0, 0, v0, 24 -; GFX7-NEXT: v_mad_u32_u24 v1, v7, v14, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 +; GFX7-NEXT: v_mad_u32_u24 v0, v9, v16, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v8, v15, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, v7, v14, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v6, v13, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v5, v12, v0 ; GFX7-NEXT: v_mad_u32_u24 v0, v4, v11, v0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -478,51 +478,49 @@ ; GFX67-SDAG-LABEL: clpeak_imad_pat_v3i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v7, 0, v7, 16 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 
0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 +; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1 ; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v3, 0, v8, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v0, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -738,41 +736,39 @@ ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v10, v7, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v9, 0, v9, 16 +; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v10, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v8, v6, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v6, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, 
v9, v5, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v9, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v11, v4, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v11, v4, 1 -; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v0, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6 ; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 @@ -783,11 +779,11 @@ ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v8, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -798,7 +794,7 @@ ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 -; GFX67-SDAG-NEXT: v_alignbit_b32 v5, 0, v0, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1395,51 +1391,49 @@ ; GFX67-SDAG-LABEL: clpeak_umad_pat_v3i16: ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v7, 16, v1 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v7, 0, v7, 16 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v7, v4, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v7, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v3, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v3, 1 -; 
GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v8, v4 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v6, v3, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v8, v4, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v3, 1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v6, v5, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v7, v0, v3 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v7, v5, v2 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v0, v3 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 -; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v6, v9, v6 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v3, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 +; GFX67-SDAG-NEXT: v_add_i32_e32 v6, vcc, s4, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v2, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v6, v6, v5, 1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v7, v5, 1 ; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v3, 0, v8, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v2, v5, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v4 ; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v7 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v0, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -1655,41 +1649,39 @@ ; GFX67-SDAG: ; %bb.0: ; %entry ; GFX67-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX67-SDAG-NEXT: v_add_i32_e32 v3, vcc, 1, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v3 +; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v3 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX67-SDAG-NEXT: v_add_i32_e32 v2, vcc, 1, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v11, v7, v3 +; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v2 ; GFX67-SDAG-NEXT: v_add_i32_e32 v1, vcc, 1, v1 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v3, v10, v7, v3 -; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX67-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX67-SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v9, 0, v9, 16 +; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v0 +; 
GFX67-SDAG-NEXT: v_and_b32_e32 v10, 0xffff, v1 +; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v10, v7 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v13, v11, v7 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v8, v6, 1 -; GFX67-SDAG-NEXT: v_and_b32_e32 v11, 0xffff, v0 -; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v8, v6, v2 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v9, v5, v1 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v7, v9, v6, 1 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v10, v5 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v2, v9, v6, v2 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v8, v4, v0 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v1, v10, v5, v1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v12, v9, v5 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v11, v4, v0 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v11, v4, 1 -; GFX67-SDAG-NEXT: v_alignbit_b32 v1, 0, v1, 16 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; GFX67-SDAG-NEXT: v_mad_u32_u24 v8, v8, v4, 1 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX67-SDAG-NEXT: v_or_b32_e32 v7, v9, v7 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v9, 16, v12 ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v9, v0, v4 +; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v10, v0, v4 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v5 ; GFX67-SDAG-NEXT: s_mov_b32 s4, 0x10000 -; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX67-SDAG-NEXT: v_or_b32_e32 v8, v9, v8 ; GFX67-SDAG-NEXT: v_mad_u32_u24 v0, v0, v4, 1 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v5, v2, v6 ; GFX67-SDAG-NEXT: v_add_i32_e32 v8, vcc, s4, v8 @@ -1700,11 +1692,11 @@ ; GFX67-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX67-SDAG-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; GFX67-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX67-SDAG-NEXT: v_alignbit_b32 v4, 0, v8, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX67-SDAG-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX67-SDAG-NEXT: v_and_b32_e32 v9, 0xffff, v10 ; GFX67-SDAG-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v1, v1, v4 ; GFX67-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v5 @@ -1715,7 +1707,7 @@ ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v8, v9, v8 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v4, v4, v5 ; GFX67-SDAG-NEXT: v_mul_u32_u24_e32 v3, v3, v6 -; GFX67-SDAG-NEXT: v_alignbit_b32 v5, 0, v0, 16 +; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX67-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 ; GFX67-SDAG-NEXT: v_and_b32_e32 v7, 0xffff, v8 ; GFX67-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll --- a/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll @@ -155,7 +155,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v0, 0xa000000, v0 -; GCN-NEXT: v_alignbit_b32 v0, 0, v0, 25 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 25, v0 ; GCN-NEXT: v_add_u32_e32 v0, 
55, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %value.knownbits2 = and i64 %x, 167772160 ; 0xA000000 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1873,38 +1873,38 @@ ; NOSDWA-NEXT: v_mov_b32_e32 v2, s2 ; NOSDWA-NEXT: v_mov_b32_e32 v3, s3 ; NOSDWA-NEXT: s_waitcnt vmcnt(0) -; NOSDWA-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1] -; NOSDWA-NEXT: v_and_b32_e32 v6, 0xff, v0 -; NOSDWA-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; NOSDWA-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NOSDWA-NEXT: v_and_b32_e32 v5, 0xff, v1 +; NOSDWA-NEXT: v_and_b32_e32 v7, 0xff, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; NOSDWA-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v7, 8, v7 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v5, 8, v5 +; NOSDWA-NEXT: v_lshlrev_b16_e32 v6, 8, v6 ; NOSDWA-NEXT: v_and_b32_e32 v0, 0xff, v0 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v8, 8, v8 ; NOSDWA-NEXT: v_lshlrev_b16_e32 v9, 8, v9 ; NOSDWA-NEXT: v_and_b32_e32 v1, 0xff, v1 -; NOSDWA-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; NOSDWA-NEXT: v_or_b32_e32 v6, v6, v7 -; NOSDWA-NEXT: v_or_b32_e32 v5, v5, v8 +; NOSDWA-NEXT: v_or_b32_e32 v4, v4, v5 +; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v6 +; NOSDWA-NEXT: v_or_b32_e32 v5, v7, v8 ; NOSDWA-NEXT: v_or_b32_e32 v1, v1, v9 -; NOSDWA-NEXT: v_or_b32_e32 v0, v0, v4 -; NOSDWA-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; NOSDWA-NEXT: v_or_b32_e32 v0, v6, v0 -; NOSDWA-NEXT: v_or_b32_e32 v1, v4, v1 +; NOSDWA-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; NOSDWA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; NOSDWA-NEXT: v_or_b32_e32 v0, v4, v0 +; NOSDWA-NEXT: v_or_b32_e32 v1, v5, v1 ; NOSDWA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; NOSDWA-NEXT: s_endpgm ; ; GFX89-LABEL: pulled_out_test: ; GFX89: ; %bb.0: ; %entry ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: v_mov_b32_e32 v6, 8 -; GFX89-NEXT: v_mov_b32_e32 v7, 0xff +; GFX89-NEXT: v_mov_b32_e32 v4, 8 +; GFX89-NEXT: v_mov_b32_e32 v5, 0xff ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: v_mov_b32_e32 v0, s0 ; GFX89-NEXT: v_mov_b32_e32 v1, s1 @@ -1912,73 +1912,72 @@ ; GFX89-NEXT: v_mov_b32_e32 v2, s2 ; GFX89-NEXT: v_mov_b32_e32 v3, s3 ; GFX89-NEXT: s_waitcnt vmcnt(0) -; GFX89-NEXT: v_lshrrev_b64 v[4:5], 24, v[0:1] -; GFX89-NEXT: v_lshrrev_b32_sdwa v8, v6, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v6, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX89-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX89-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX89-NEXT: v_and_b32_sdwa v5, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, 
v9 -; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX89-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX89-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b16_e32 v6, 8, v7 +; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX89-NEXT: v_lshlrev_b16_e32 v4, 8, v9 +; GFX89-NEXT: v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX89-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX89-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX89-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX89-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX89-NEXT: s_endpgm ; ; GFX9-LABEL: pulled_out_test: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_sdwa v6, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_and_b32_sdwa v3, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v8, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v5 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v3, 8, v7 +; GFX9-NEXT: v_or_b32_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: pulled_out_test: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 -; GFX10-NEXT: v_mov_b32_e32 v6, 0xff -; GFX10-NEXT: v_mov_b32_e32 v7, 24 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 24 +; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v7, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v3, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v3, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v4, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v6, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX10-NEXT: s_endpgm entry: %idxprom = ashr exact i64 15, 32 diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,12 +149,11 @@ ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, 0x41 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x41 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -168,11 +167,10 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshr_b64 v[1:2], 33, v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v0, 33, v1, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 33, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2489,8 +2489,7 @@ ; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s0, v0 ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_alignbit_b32 v0, 0, vcc_lo, 1 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 ; GFX1032-NEXT: s_min_u32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 @@ -2587,9 +2586,8 @@ ; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_alignbit_b32 v1, 0, vcc_lo, 1 +; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 ; GFX1032-NEXT: s_min_u32 s0, s0, s1 ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 diff --git a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll --- a/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll +++ b/llvm/test/CodeGen/X86/2008-05-12-tailmerge-5.ll @@ -15,10 +15,9 @@ ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %ah, {{[0-9]+}}(%rsp) -; CHECK-NEXT: shrq $16, %rsi -; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) -; CHECK-NEXT: shrq $24, %rax +; CHECK-NEXT: shrq $16, %rax ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb %ah, 
{{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movb %r8b, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll --- a/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll +++ b/llvm/test/CodeGen/X86/2009-05-30-ISelBug.ll @@ -8,10 +8,10 @@ ; CHECK-NEXT: movl %edx, %edx ; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %r10d +; CHECK-NEXT: # kill: def $edx killed $edx def $rdx +; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $4, %r10d -; CHECK-NEXT: shrq $6, %rdx -; CHECK-NEXT: andl $67108860, %edx # imm = 0x3FFFFFC -; CHECK-NEXT: movl (%rdi,%rdx), %edx +; CHECK-NEXT: movl (%rdi,%rdx,4), %edx ; CHECK-NEXT: movzbl %dl, %edi ; CHECK-NEXT: shrl $8, %edx ; CHECK-NEXT: addl $5, %esi diff --git a/llvm/test/CodeGen/X86/3addr-or.ll b/llvm/test/CodeGen/X86/3addr-or.ll --- a/llvm/test/CodeGen/X86/3addr-or.ll +++ b/llvm/test/CodeGen/X86/3addr-or.ll @@ -20,12 +20,11 @@ define i64 @test2(i8 %A, i8 %B) nounwind { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: shll $4, %edi ; CHECK-NEXT: andl $48, %edi ; CHECK-NEXT: movzbl %sil, %eax -; CHECK-NEXT: shrq $4, %rax -; CHECK-NEXT: orq %rdi, %rax +; CHECK-NEXT: shrl $4, %eax +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq %C = zext i8 %A to i64 %D = shl i64 %C, 4 diff --git a/llvm/test/CodeGen/X86/and-shift.ll b/llvm/test/CodeGen/X86/and-shift.ll --- a/llvm/test/CodeGen/X86/and-shift.ll +++ b/llvm/test/CodeGen/X86/and-shift.ll @@ -54,8 +54,8 @@ ; X64-LABEL: shift30_and2_i64: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: shrq $30, %rax -; X64-NEXT: andl $2, %eax +; X64-NEXT: shrl $30, %eax +; X64-NEXT: andl $-2, %eax ; X64-NEXT: retq %shr = lshr i64 %x, 30 %and = and i64 %shr, 2 diff --git a/llvm/test/CodeGen/X86/bswap.ll b/llvm/test/CodeGen/X86/bswap.ll --- a/llvm/test/CodeGen/X86/bswap.ll +++ b/llvm/test/CodeGen/X86/bswap.ll @@ -166,8 +166,8 @@ ; CHECK64-LABEL: not_bswap: ; CHECK64: # %bb.0: ; CHECK64-NEXT: movzwl var16(%rip), %eax -; CHECK64-NEXT: movq %rax, %rcx -; CHECK64-NEXT: shrq $8, %rcx +; CHECK64-NEXT: movl %eax, %ecx +; CHECK64-NEXT: shrl $8, %ecx ; CHECK64-NEXT: shlq $8, %rax ; CHECK64-NEXT: orq %rcx, %rax ; CHECK64-NEXT: retq @@ -224,9 +224,12 @@ ; ; CHECK64-LABEL: finally_useful_bswap: ; CHECK64: # %bb.0: -; CHECK64-NEXT: movzwl var16(%rip), %eax -; CHECK64-NEXT: bswapq %rax -; CHECK64-NEXT: shrq $48, %rax +; CHECK64-NEXT: movzwl var16(%rip), %ecx +; CHECK64-NEXT: movzbl %cl, %eax +; CHECK64-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: shlq $8, %rax +; CHECK64-NEXT: orq %rcx, %rax ; CHECK64-NEXT: retq %init = load i16, ptr @var16 %big = zext i16 %init to i64 diff --git a/llvm/test/CodeGen/X86/combine-bitreverse.ll b/llvm/test/CodeGen/X86/combine-bitreverse.ll --- a/llvm/test/CodeGen/X86/combine-bitreverse.ll +++ b/llvm/test/CodeGen/X86/combine-bitreverse.ll @@ -369,20 +369,19 @@ ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $235867919, %ecx # imm = 0xE0F0F0F ; X64-NEXT: shlq $4, %rcx -; X64-NEXT: shrq $4, %rax +; X64-NEXT: shrl $4, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F ; X64-NEXT: orq %rcx, %rax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $590558003, %ecx # imm = 0x23333333 -; X64-NEXT: shrq $2, %rax +; X64-NEXT: shrl $2, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 ; X64-NEXT: leaq (%rax,%rcx,4), %rax -; X64-NEXT: movabsq $6148914691236517205, %rcx # 
imm = 0x5555555555555555 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq %rax -; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $357913941, %ecx # imm = 0x15555555 +; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: leaq (%rax,%rcx,2), %rax ; X64-NEXT: retq %1 = call i64 @llvm.bitreverse.i64(i64 %a) %2 = shl i64 %1, 33 diff --git a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/const-shift-of-constmasked.ll @@ -1585,7 +1585,7 @@ ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: andl $2147483646, %eax # imm = 0x7FFFFFFE -; X64-NEXT: shrq %rax +; X64-NEXT: shrl %eax ; X64-NEXT: retq %t0 = and i64 %a0, 2147483647 %t1 = lshr i64 %t0, 1 @@ -1759,7 +1759,7 @@ ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: andl $2147483646, %eax # imm = 0x7FFFFFFE -; X64-NEXT: shrq %rax +; X64-NEXT: shrl %eax ; X64-NEXT: retq %t0 = and i64 %a0, 2147483647 %t1 = ashr i64 %t0, 1 diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -8130,22 +8130,22 @@ ; ; X64-NOBMI-LABEL: pr38938: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movq (%rsi), %rax -; X64-NOBMI-NEXT: shrq $19, %rax -; X64-NOBMI-NEXT: andl $4092, %eax # imm = 0xFFC -; X64-NOBMI-NEXT: incl (%rdi,%rax) +; X64-NOBMI-NEXT: movl (%rsi), %eax +; X64-NOBMI-NEXT: shrl $21, %eax +; X64-NOBMI-NEXT: andl $1023, %eax # imm = 0x3FF +; X64-NOBMI-NEXT: incl (%rdi,%rax,4) ; X64-NOBMI-NEXT: retq ; ; X64-BMINOTBM-LABEL: pr38938: ; X64-BMINOTBM: # %bb.0: ; X64-BMINOTBM-NEXT: movl $2581, %eax # imm = 0xA15 -; X64-BMINOTBM-NEXT: bextrq %rax, (%rsi), %rax +; X64-BMINOTBM-NEXT: bextrl %eax, (%rsi), %eax ; X64-BMINOTBM-NEXT: incl (%rdi,%rax,4) ; X64-BMINOTBM-NEXT: retq ; ; X64-BMITBM-LABEL: pr38938: ; X64-BMITBM: # %bb.0: -; X64-BMITBM-NEXT: bextrq $2581, (%rsi), %rax # imm = 0xA15 +; X64-BMITBM-NEXT: bextrl $2581, (%rsi), %eax # imm = 0xA15 ; X64-BMITBM-NEXT: incl (%rdi,%rax,4) ; X64-BMITBM-NEXT: retq %tmp = load i64, ptr %a1, align 8 diff --git a/llvm/test/CodeGen/X86/h-registers-0.ll b/llvm/test/CodeGen/X86/h-registers-0.ll --- a/llvm/test/CodeGen/X86/h-registers-0.ll +++ b/llvm/test/CodeGen/X86/h-registers-0.ll @@ -10,21 +10,21 @@ define void @bar64(i64 inreg %x, ptr inreg %p) nounwind { ; X64-LABEL: bar64: ; X64: # %bb.0: -; X64-NEXT: shrq $8, %rdi +; X64-NEXT: shrl $8, %edi ; X64-NEXT: incb %dil ; X64-NEXT: movb %dil, (%rsi) ; X64-NEXT: retq ; ; X32-LABEL: bar64: ; X32: # %bb.0: -; X32-NEXT: shrq $8, %rdi +; X32-NEXT: shrl $8, %edi ; X32-NEXT: incb %dil ; X32-NEXT: movb %dil, (%esi) ; X32-NEXT: retq ; ; WIN64-LABEL: bar64: ; WIN64: # %bb.0: -; WIN64-NEXT: shrq $8, %rcx +; WIN64-NEXT: shrl $8, %ecx ; WIN64-NEXT: incb %cl ; WIN64-NEXT: movb %cl, (%rdx) ; WIN64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -68,7 +68,7 @@ ; X64-BSR-LABEL: lshr_ctlz_undef_cmpeq_one_i64: ; X64-BSR: # %bb.0: ; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: shrq $6, %rax +; X64-BSR-NEXT: shrl $6, %eax ; X64-BSR-NEXT: cmpl $1, %eax ; X64-BSR-NEXT: sete %al ; X64-BSR-NEXT: retq @@ -76,7 +76,7 @@ ; X64-LZCNT-LABEL: lshr_ctlz_undef_cmpeq_one_i64: ; X64-LZCNT: # 
%bb.0: ; X64-LZCNT-NEXT: lzcntq %rdi, %rax -; X64-LZCNT-NEXT: shrq $6, %rax +; X64-LZCNT-NEXT: shrl $6, %eax ; X64-LZCNT-NEXT: cmpl $1, %eax ; X64-LZCNT-NEXT: sete %al ; X64-LZCNT-NEXT: retq @@ -149,7 +149,7 @@ ; X64-BSR-LABEL: lshr_ctlz_undef_cmpne_zero_i64: ; X64-BSR: # %bb.0: ; X64-BSR-NEXT: bsrq %rdi, %rax -; X64-BSR-NEXT: testq $-64, %rax +; X64-BSR-NEXT: testl $-64, %eax ; X64-BSR-NEXT: setne %al ; X64-BSR-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll --- a/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll +++ b/llvm/test/CodeGen/X86/zext-logicop-shift-load.ll @@ -89,9 +89,9 @@ ; ; X64-LABEL: test4: ; X64: # %bb.0: # %entry -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: shrq $2, %rax -; X64-NEXT: andl $60, %eax +; X64-NEXT: movzbl (%rdi), %eax +; X64-NEXT: shrl $2, %eax +; X64-NEXT: andl $-4, %eax ; X64-NEXT: retq entry: %bf.load = load i8, ptr %data, align 4 @@ -114,7 +114,7 @@ ; X64-LABEL: test5: ; X64: # %bb.0: # %entry ; X64-NEXT: movzbl (%rdi), %eax -; X64-NEXT: shrq $2, %rax +; X64-NEXT: shrl $2, %eax ; X64-NEXT: xorq $60, %rax ; X64-NEXT: retq entry: @@ -138,7 +138,7 @@ ; X64-LABEL: test6: ; X64: # %bb.0: # %entry ; X64-NEXT: movzbl (%rdi), %eax -; X64-NEXT: shrq $2, %rax +; X64-NEXT: shrl $2, %eax ; X64-NEXT: orq $60, %rax ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/zext-lshr.ll b/llvm/test/CodeGen/X86/zext-lshr.ll --- a/llvm/test/CodeGen/X86/zext-lshr.ll +++ b/llvm/test/CodeGen/X86/zext-lshr.ll @@ -42,7 +42,7 @@ ; X64-LABEL: i64_zext_shift_i16_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shrq $5, %rax +; X64-NEXT: shrl $5, %eax ; X64-NEXT: retq %t0 = zext i8 %a0 to i16 %t1 = lshr i16 %t0, 5 @@ -103,7 +103,7 @@ ; X64-LABEL: i128_zext_shift_i64_zext_i8: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: shrq $4, %rax +; X64-NEXT: shrl $4, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i8 %a0 to i64 @@ -127,7 +127,7 @@ ; X64-LABEL: i128_zext_shift_i64_zext_i16: ; X64: # %bb.0: ; X64-NEXT: movzwl %di, %eax -; X64-NEXT: shrq $7, %rax +; X64-NEXT: shrl $7, %eax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: retq %t0 = zext i16 %a0 to i64
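
Note (illustration only, not part of the patch): the new SimplifyDemandedBits rule fires when a wide right shift only needs bits from the low half of its source, either because the caller's demanded mask ignores the high half or because the high half is known to be zero. A minimal LLVM IR sketch of that situation, modeled on the partial-shift-shrink.ll test above (the function name narrow_srl_knownbits is hypothetical):

define i64 @narrow_srl_knownbits(i64 %x) {
  %masked = and i64 %x, 167772160   ; 0xA000000: the high 32 bits are known zero
  %shifted = lshr i64 %masked, 25   ; 64-bit shift whose useful bits fit in 32 bits
  ret i64 %shifted
}

With the combine, the i64 srl is rewritten roughly as (zero_extend (srl (trunc %masked to i32), 25)), which is why the AMDGPU output above changes from the v_alignbit_b32 sequence to a plain v_lshrrev_b32_e32, and why many of the X86 tests now select 32-bit shrl in place of shrq.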