diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10691,7 +10691,14 @@ return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad); } -// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z). +// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high +// multiplies, if any. +// +// Full 64-bit multiplies that feed into an addition are lowered here instead +// of using the generic expansion. The generic expansion ends up with +// a tree of ADD nodes that prevents us from using the "add" part of the +// MAD instruction. The expansion produced here results in a chain of ADDs +// instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::ADD); @@ -10705,6 +10712,11 @@ if (VT.isVector()) return SDValue(); + // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall + // result in scalar registers for uniform values. + if (!N->isDivergent() && Subtarget->hasSMulHi()) + return SDValue(); + unsigned NumBits = VT.getScalarSizeInBits(); if (NumBits <= 32 || NumBits > 64) return SDValue(); @@ -10714,27 +10726,96 @@ std::swap(LHS, RHS); } + // Avoid the fold if it would unduly increase the number of multiplies due to + // multiple uses, except on hardware with full-rate multiply-add (which is + // part of full-rate 64-bit ops). + if (!Subtarget->hasFullRate64Ops()) { + unsigned NumUsers = 0; + for (SDNode *Use : LHS->uses()) { + // There is a use that does not feed into addition, so the multiply can't + // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. + if (Use->getOpcode() != ISD::ADD) + return SDValue(); + + // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer + // MUL + 3xADD + 3xADDC over 3xMAD. + ++NumUsers; + if (NumUsers >= 3) + return SDValue(); + } + } + SDValue MulLHS = LHS.getOperand(0); SDValue MulRHS = LHS.getOperand(1); SDValue AddRHS = RHS; - // TODO: Maybe restrict if SGPR inputs. - if (numBitsUnsigned(MulLHS, DAG) <= 32 && - numBitsUnsigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false); + // Always check whether operands are small unsigned values, since that + // knowledge is useful in more cases. Check for small signed values only if + // doing so can unlock a shorter code sequence. + bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32; + bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32; + + bool MulSignedLo = false; + if (!MulLHSUnsigned32 || !MulRHSUnsigned32) { + MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 && + numBitsSigned(MulRHS, DAG) <= 32; } - if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) { - MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32); - MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32); - AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64); - return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true); + // The operands and final result all have the same number of bits. If + // operands need to be extended, they can be extended with garbage. The + // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is + // truncated away in the end. 
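+  //
+  // For example (an illustration of the point above, using the i48 case
+  // exercised by the mad_i48_i48 test added below): the i48 operands are
+  // any-extended to i64, so whatever garbage ends up in bits 48..63 of the
+  // 64-bit mad result is discarded by the final TRUNCATE back to i48.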
+ if (VT != MVT::i64) { + MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS); + MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS); + AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS); } - return SDValue(); + // The basic code generated is conceptually straightforward. Pseudo code: + // + // accum = mad_64_32 lhs.lo, rhs.lo, accum + // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi + // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi + // + // The second and third lines are optional, depending on whether the factors + // are {sign,zero}-extended or not. + // + // The actual DAG is noisier than the pseudo code, but only due to + // instructions that disassemble values into low and high parts, and + // assemble the final result. + SDValue Zero = DAG.getConstant(0, SL, MVT::i32); + SDValue One = DAG.getConstant(1, SL, MVT::i32); + + auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS); + auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS); + SDValue Accum = + getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo); + + if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) { + auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero); + auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One); + + if (!MulLHSUnsigned32) { + auto MulLHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); + } + + if (!MulRHSUnsigned32) { + auto MulRHSHi = + DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One); + SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi); + AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi); + } + + Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi}); + Accum = DAG.getBitcast(MVT::i64, Accum); + } + + if (VT != MVT::i64) + Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum); + return Accum; } SDValue SITargetLowering::performAddCombine(SDNode *N, diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -830,17 +830,16 @@ ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -874,17 +873,16 @@ ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 
0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -922,16 +920,14 @@ ; GFX1064-NEXT: .LBB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s6, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2 -; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1] +; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1064-NEXT: s_endpgm ; @@ -968,16 +964,14 @@ ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0 ; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s6, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2 -; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1] +; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -968,18 +968,17 @@ ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: 
v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -1009,18 +1008,17 @@ ; GFX9-NEXT: .LBB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: s_mov_b32 s4, s0 +; GFX9-NEXT: s_mov_b32 s5, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -1052,16 +1050,14 @@ ; GFX1064-NEXT: .LBB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, s[4:5] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v2 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1092,16 +1088,14 @@ ; GFX1032-NEXT: .LBB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4 -; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v2 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5] ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -211,8 +211,6 @@ ; CI-LABEL: mad_i64_i32_sextops_i32_i63: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; CI-NEXT: v_ashr_i64 v[2:3], v[2:3], 1 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -228,8 +226,6 @@ ; GFX9-LABEL: mad_i64_i32_sextops_i32_i63: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_ashrrev_i64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i63 @@ -243,9 +239,7 @@ ; CI-LABEL: mad_i64_i32_sextops_i31_i63: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; CI-NEXT: v_bfe_i32 v1, v1, 0, 31 -; CI-NEXT: v_ashr_i64 v[2:3], v[2:3], 1 ; CI-NEXT: v_bfe_i32 v0, v0, 0, 31 ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] ; CI-NEXT: s_setpc_b64 s[30:31] @@ -266,8 +260,6 @@ ; GFX9-LABEL: mad_i64_i32_sextops_i31_i63: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_ashrrev_i64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 31 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] @@ -285,10 +277,8 @@ ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; CI-NEXT: v_mul_lo_u32 v4, v4, v1 -; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, 0 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3] +; CI-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: mad_i64_i32_extops_i32_i64: @@ -308,10 +298,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v4, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, v4, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %ext0 = sext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 @@ -352,12 +340,10 @@ ; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 1, v1 -; CI-NEXT: v_mul_lo_u32 v3, v1, v2 -; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CI-NEXT: v_and_b32_e32 v3, 1, v1 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5] +; CI-NEXT: v_mul_lo_u32 v2, v3, v2 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small: @@ -375,12 +361,10 @@ ; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 %trunc.rhs = and i64 %arg1, 4294967295 @@ -393,12 +377,11 @@ ; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 1, v3 -; CI-NEXT: v_mul_lo_u32 v3, v0, v1 -; CI-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; CI-NEXT: v_mov_b32_e32 v6, v0 +; CI-NEXT: v_and_b32_e32 v3, 1, v3 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5] +; CI-NEXT: v_mul_lo_u32 v2, v6, v3 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small: @@ -416,12 +399,11 @@ ; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, v1 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v2, 0 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 8589934591 @@ -530,9 +512,11 @@ ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, v[0:1] +; GFX9-NEXT: s_mul_i32 s0, s2, s3 +; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 +; GFX9-NEXT: s_add_u32 s0, s0, s4 +; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 @@ -587,9 +571,13 @@ ; CI-LABEL: mad_i64_i32_thrice: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3] -; CI-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5] -; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7] +; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, v0, v4 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v1, v5, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v6 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc ; CI-NEXT: v_xor_b32_e32 v3, v3, v5 ; CI-NEXT: v_xor_b32_e32 v2, v2, v4 ; CI-NEXT: v_xor_b32_e32 v1, v3, v1 @@ -639,10 +627,11 @@ ; CI-LABEL: mad_i64_i32_secondary_use: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0 -; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3] -; CI-NEXT: v_xor_b32_e32 v1, v1, v5 -; CI-NEXT: v_xor_b32_e32 v0, v0, v4 +; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0 +; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc +; CI-NEXT: v_xor_b32_e32 v1, v3, v1 +; CI-NEXT: v_xor_b32_e32 v0, v2, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; SI-LABEL: mad_i64_i32_secondary_use: @@ -672,5 +661,46 @@ ret i64 %out } +define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 { +; CI-LABEL: mad_i48_i48: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v6, v1 +; CI-NEXT: v_mov_b32_e32 v7, v0 +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5] +; CI-NEXT: v_mul_lo_u32 v2, v6, v2 +; CI-NEXT: v_mul_lo_u32 v3, v7, v3 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: 
v_add_i32_e32 v1, vcc, v3, v1 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: mad_i48_i48: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mul_lo_u32 v3, v0, v3 +; SI-NEXT: v_mul_hi_u32 v6, v0, v2 +; SI-NEXT: v_mul_lo_u32 v1, v1, v2 +; SI-NEXT: v_mul_lo_u32 v0, v0, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; SI-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, v1, v5, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: mad_i48_i48: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v2 +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %m = mul i48 %arg0, %arg1 + %a = add i48 %m, %arg2 + ret i48 %a +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone speculatable }