diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1406,6 +1406,7 @@
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineSUB(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1360,6 +1360,8 @@
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine({ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL, ISD::MUL,
                        ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
+  if (Subtarget.isISA3_0() && Subtarget.isPPC64())
+    setTargetDAGCombine(ISD::SUB);
   if (Subtarget.hasFPCVT())
     setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
@@ -15106,6 +15108,8 @@
   default: break;
   case ISD::ADD:
     return combineADD(N, DCI);
+  case ISD::SUB:
+    return combineSUB(N, DCI);
   case ISD::SHL:
     return combineSHL(N, DCI);
   case ISD::SRA:
@@ -17125,6 +17129,115 @@
   return MatPCRel;
 }
 
+// Look for i128 multiply-add opportunities:
+// A, B, C are i128 operands
+// NumSignBits > 64 : ST
+// NumSignBits > 65 : NT (ST whose i64 negation cannot wrap)
+// NumSignBits == 64 && SignBitIsZero : ZT
+// (add (mul A(ST) B(ST)) C(ST)) =>
+//   (build_pair (maddld (trunc A), (trunc B), (trunc C)),
+//               (maddhd (trunc A), (trunc B), (trunc C)))
+// (add (mul A(ZT) B(ZT)) C(ZT)) =>
+//   (build_pair (maddld (trunc A), (trunc B), (trunc C)),
+//               (maddhdu (trunc A), (trunc B), (trunc C)))
+// (sub (mul A(ST) B(ST)) C(NT)) =>
+//   (build_pair (maddld (trunc A), (trunc B), -(trunc C)),
+//               (maddhd (trunc A), (trunc B), -(trunc C)))
+// (sub C(ST) (mul A(NT) B(ST))) =>
+//   (build_pair (maddld -(trunc A), (trunc B), (trunc C)),
+//               (maddhd -(trunc A), (trunc B), (trunc C)))
+// (sub C(ST) (mul A(ST) B(NT))) =>
+//   (build_pair (maddld (trunc A), -(trunc B), (trunc C)),
+//               (maddhd (trunc A), -(trunc B), (trunc C)))
+static SDValue combineADD_SUBToMADD(SDNode *N, SelectionDAG &DAG,
+                                    const PPCSubtarget &Subtarget) {
+  if (!Subtarget.isPPC64() || !Subtarget.isISA3_0())
+    return SDValue();
+
+  const bool IsSub = N->getOpcode() == ISD::SUB;
+  if (N->getOpcode() != ISD::ADD && !IsSub)
+    return SDValue();
+
+  if (N->getValueType(0) != MVT::i128)
+    return SDValue();
+
+  // If both operands are multiplies, the first one is the one folded.
+  const bool MulIsOp0 = N->getOperand(0).getOpcode() == ISD::MUL;
+  if (!MulIsOp0 && N->getOperand(1).getOpcode() != ISD::MUL)
+    return SDValue();
+
+  SDValue MulOp = N->getOperand(MulIsOp0 ? 0 : 1);
+  // Do not transform if there are other users of the mul.
+  if (!MulOp.hasOneUse())
+    return SDValue();
+
+  SDValue MulLHS = MulOp->getOperand(0);
+  SDValue MulRHS = MulOp->getOperand(1);
+  SDValue AddSubOp = N->getOperand(MulIsOp0 ? 1 : 0);
+
+  // Classify an i128 operand by how it extends from i64. Fails when it has
+  // fewer than 64 sign bits. Exactly 64 sign bits is accepted only when the
+  // sign bit is zero, which maps to unsigned; all other cases are signed.
+  auto Classify = [&DAG](SDValue Op, unsigned &SignBits, bool &IsUnsigned) {
+    SignBits = DAG.ComputeNumSignBits(Op);
+    if (SignBits < 64)
+      return false;
+    IsUnsigned = SignBits == 64;
+    return !IsUnsigned || DAG.SignBitIsZero(Op);
+  };
+
+  unsigned MulLSignBits = 0, MulRSignBits = 0, AddSubSignBits = 0;
+  bool IsUnsignedMulL = false, IsUnsignedMulR = false;
+  bool IsUnsignedAddSub = false;
+  if (!Classify(MulLHS, MulLSignBits, IsUnsignedMulL) ||
+      !Classify(MulRHS, MulRSignBits, IsUnsignedMulR) ||
+      !Classify(AddSubOp, AddSubSignBits, IsUnsignedAddSub))
+    return SDValue();
+
+  // Signed-ness should be the same.
+  if (IsUnsignedMulL != IsUnsignedMulR || IsUnsignedMulL != IsUnsignedAddSub)
+    return SDValue();
+
+  // SUB unsigned is not worth the effort.
+  if (IsSub && IsUnsignedAddSub)
+    return SDValue();
+
+  // SUB is folded by negating one truncated operand. An operand with exactly
+  // 65 sign bits may be INT64_MIN, whose i64 negation wraps and would leave
+  // the high doubleword off by one, so require more than 65 sign bits.
+  bool NegateMulRHS = false;
+  if (IsSub) {
+    if (MulIsOp0) {
+      if (AddSubSignBits <= 65)
+        return SDValue();
+    } else if (MulLSignBits <= 65) {
+      // Negating either factor negates the product.
+      if (MulRSignBits <= 65)
+        return SDValue();
+      NegateMulRHS = true;
+    }
+  }
+
+  SDLoc dl(N);
+  SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MulLHS);
+  SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MulRHS);
+  SDValue Op2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, AddSubOp);
+
+  // Move the minus sign for SUB.
+  if (IsSub) {
+    SDValue &Negated = MulIsOp0 ? Op2 : (NegateMulRHS ? Op1 : Op0);
+    Negated = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                          DAG.getConstant(0, dl, MVT::i64), Negated);
+  }
+
+  SDValue MAddL =
+      BuildIntrinsicOp(Intrinsic::ppc_maddld, Op0, Op1, Op2, DAG, dl);
+  SDValue MAddH = BuildIntrinsicOp(IsUnsignedAddSub ? Intrinsic::ppc_maddhdu
+                                                    : Intrinsic::ppc_maddhd,
+                                   Op0, Op1, Op2, DAG, dl);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128, MAddL, MAddH);
+}
+
 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
     return Value;
@@ -17132,6 +17245,17 @@
   if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
     return Value;
 
+  if (auto Value = combineADD_SUBToMADD(N, DCI.DAG, Subtarget))
+    return Value;
+
+  return SDValue();
+}
+
+// Target DAG combine for ISD::SUB; only registered for 64-bit ISA 3.0.
+SDValue PPCTargetLowering::combineSUB(SDNode *N, DAGCombinerInfo &DCI) const {
+  if (auto Value = combineADD_SUBToMADD(N, DCI.DAG, Subtarget))
+    return Value;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll b/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
--- a/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
+++ b/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
@@ -4,11 +4,9 @@
 define i128 @add_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_int128:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhd 4, 4, 3
-; CHECK-P9-NEXT:    sradi 7, 5, 63
-; CHECK-P9-NEXT:    addc 3, 6, 5
-; CHECK-P9-NEXT:    adde 4, 4, 7
+; CHECK-P9-NEXT:    maddld 6, 4, 3, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 3, 5
+; CHECK-P9-NEXT:    mr 3, 6
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i64 %a to i128
@@ -23,12 +21,9 @@
 ; CHECK-P9-LABEL: or_xor_add_int128:
 ; CHECK-P9:       # %bb.0: # %entry
 ; CHECK-P9-NEXT:    or 6, 3, 4
-; CHECK-P9-NEXT:    xor 3, 5, 3
-; CHECK-P9-NEXT:    mulld 7, 4, 6
-; CHECK-P9-NEXT:    mulhd 4, 4, 6
-; CHECK-P9-NEXT:    sradi 5, 3, 63
-; CHECK-P9-NEXT:    addc 3, 7, 3
-; CHECK-P9-NEXT:    adde 4, 4, 5
+; CHECK-P9-NEXT:    xor 5, 5, 3
+; CHECK-P9-NEXT:    maddld 3, 4, 6, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 6, 5
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i64 %a to i128
@@ -44,10 +39,9 @@
 define i128 @add_unsigned_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_unsigned_int128:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhdu 4, 4, 3
-; CHECK-P9-NEXT:    addc 3, 6, 5
-; CHECK-P9-NEXT:    addze 4, 4
+; CHECK-P9-NEXT:    maddld 6, 4, 3, 5
+; CHECK-P9-NEXT:    maddhdu 4, 4, 3, 5
+; CHECK-P9-NEXT:    mr 3, 6
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = zext i64 %a to i128
diff --git a/llvm/test/CodeGen/PowerPC/mulld.ll b/llvm/test/CodeGen/PowerPC/mulld.ll
--- a/llvm/test/CodeGen/PowerPC/mulld.ll
+++ b/llvm/test/CodeGen/PowerPC/mulld.ll
@@ -10,10 +10,9 @@
 define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* nocapture readonly %b) {
 ; CHECK-LABEL: bn_mul_comba8:
 ; CHECK: mulhdu
-; CHECK-NEXT: mulld
-; CHECK: mulhdu
-; CHECK: mulld
-; CHECK-NEXT: mulhdu
+; CHECK: maddhdu
+; CHECK-NEXT: maddld
+; CHECK: maddhdu
 
 ; CHECK-ITIN-LABEL: bn_mul_comba8: