diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1406,6 +1406,8 @@
     SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineSUB(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineADD_SUBToMADD(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1360,6 +1360,9 @@
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine({ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL, ISD::MUL,
                        ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
+  if (Subtarget.isISA3_0() && Subtarget.isPPC64()) {
+    setTargetDAGCombine({ISD::SUB});
+  }
   if (Subtarget.hasFPCVT())
     setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
@@ -15106,6 +15109,8 @@
   default: break;
   case ISD::ADD:
     return combineADD(N, DCI);
+  case ISD::SUB:
+    return combineSUB(N, DCI);
   case ISD::SHL:
     return combineSHL(N, DCI);
   case ISD::SRA:
@@ -17125,6 +17130,120 @@
   return MatPCRel;
 }

+// Look for i128 multiply-add opportunities. A, B and C are i128 operands that
+// are known to fit in 64 bits:
+//   ST: NumSignBits > 64 and the sign bit is not known to be zero.
+//   ZT: NumSignBits >= 64 and the sign bit is known to be zero.
+// (add (mul A(ST) B(ST)) C(ST)) =>
+//   (build_pair (maddld (trunc A), (trunc B), (trunc C)),
+//               (maddhd (trunc A), (trunc B), (trunc C)))
+// (add (mul A(ZT) B(ZT)) C(ZT)) =>
+//   (build_pair (maddld (trunc A), (trunc B), (trunc C)),
+//               (maddhdu (trunc A), (trunc B), (trunc C)))
+// (sub (mul A(ST) B(ST)) C(ST)) =>
+//   (build_pair (maddld (trunc A), (trunc B), -(trunc C)),
+//               (maddhd (trunc A), (trunc B), -(trunc C)))
+// (sub C(ST) (mul A(ST) B(ST))) =>
+//   (build_pair (maddld -(trunc A), (trunc B), (trunc C)),
+//               (maddhd -(trunc A), (trunc B), (trunc C)))
+// The SUB forms negate a truncated operand in i64, where 0 - INT64_MIN wraps
+// and would corrupt the high doubleword, so the negated operand must have
+// NumSignBits > 65. For C - A * B either multiplicand may be negated.
+SDValue PPCTargetLowering::combineADD_SUBToMADD(SDNode *N,
+                                                DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  if (!Subtarget.isPPC64() || !Subtarget.isISA3_0())
+    return SDValue();
+
+  const bool IsSub = N->getOpcode() == ISD::SUB;
+  if (N->getOpcode() != ISD::ADD && !IsSub)
+    return SDValue();
+
+  if (N->getValueType(0) != MVT::i128)
+    return SDValue();
+
+  SDValue MulOp = N->getOperand(0);
+  SDValue AddSubOp = N->getOperand(1);
+
+  // The multiply may be either operand; SUB needs to know which one it was.
+  const bool MulIsLHS = MulOp.getOpcode() == ISD::MUL;
+  if (!MulIsLHS)
+    std::swap(MulOp, AddSubOp);
+
+  if (MulOp.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  // Do not transform if there are other users of the mul.
+  if (!MulOp.hasOneUse())
+    return SDValue();
+
+  // Every operand must fit in 64 bits, and all three must agree on being
+  // treated as signed (ST) or unsigned (ZT).
+  SDValue MADDOps[3] = {MulOp.getOperand(0), MulOp.getOperand(1), AddSubOp};
+  unsigned NumSignBits[3];
+  bool IsUnsigned = false;
+  for (unsigned I = 0; I != 3; ++I) {
+    NumSignBits[I] = DAG.ComputeNumSignBits(MADDOps[I]);
+    if (NumSignBits[I] < 64)
+      return SDValue();
+
+    const bool OpIsUnsigned = DAG.SignBitIsZero(MADDOps[I]);
+    if (NumSignBits[I] == 64 && !OpIsUnsigned)
+      return SDValue();
+
+    if (I == 0)
+      IsUnsigned = OpIsUnsigned;
+    else if (OpIsUnsigned != IsUnsigned)
+      return SDValue();
+  }
+
+  // SUB of unsigned operands is not worth the effort.
+  if (IsSub && IsUnsigned)
+    return SDValue();
+
+  // Pick the operand to negate for SUB: C for A * B - C, a multiplicand for
+  // C - A * B. Bail out unless its i64 negation is known not to wrap.
+  unsigned NegIdx = 3; // 3: nothing is negated (ADD).
+  if (IsSub) {
+    if (MulIsLHS)
+      NegIdx = 2;
+    else
+      NegIdx = NumSignBits[0] > 65 ? 0 : 1;
+    if (NumSignBits[NegIdx] <= 65)
+      return SDValue();
+  }
+
+  SDLoc dl(N);
+  SDValue Ops[3];
+  for (unsigned I = 0; I != 3; ++I) {
+    Ops[I] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MADDOps[I]);
+    if (I == NegIdx)
+      Ops[I] = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                           DAG.getConstant(0, dl, MVT::i64), Ops[I]);
+  }
+
+  // For the unsigned case (ADD only): if the product needs fewer than 64 bits
+  // and the addend needs fewer than 64 bits, the sum fits in 64 bits and the
+  // high doubleword is zero.
+  const bool MAddHIsZero =
+      IsUnsigned &&
+      (128 - NumSignBits[0]) + (128 - NumSignBits[1]) < 64 &&
+      128 - NumSignBits[2] < 64;
+
+  SDValue MAddL =
+      BuildIntrinsicOp(Intrinsic::ppc_maddld, Ops[0], Ops[1], Ops[2], DAG, dl);
+  SDValue MAddH =
+      MAddHIsZero
+          ? DAG.getConstant(0, dl, MVT::i64)
+          : BuildIntrinsicOp(IsUnsigned ? Intrinsic::ppc_maddhdu
+                                        : Intrinsic::ppc_maddhd,
+                             Ops[0], Ops[1], Ops[2], DAG, dl);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128, MAddL, MAddH);
+}
+
 SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
   if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
     return Value;
@@ -17132,6 +17251,16 @@
   if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
     return Value;

+  if (auto Value = combineADD_SUBToMADD(N, DCI))
+    return Value;
+
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::combineSUB(SDNode *N, DAGCombinerInfo &DCI) const {
+  if (auto Value = combineADD_SUBToMADD(N, DCI))
+    return Value;
+
   return SDValue();
 }

diff --git a/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll b/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
--- a/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
+++ b/llvm/test/CodeGen/PowerPC/add-sub-int128-madd.ll
@@ -4,11 +4,9 @@
 define i128 @add_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_int128:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhd 4, 4, 3
-; CHECK-P9-NEXT:    sradi 7, 5, 63
-; CHECK-P9-NEXT:    addc 3, 6, 5
-; CHECK-P9-NEXT:    adde 4, 4, 7
+; CHECK-P9-NEXT:    maddld 6, 4, 3, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 3, 5
+; CHECK-P9-NEXT:    mr 3, 6
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i64 %a to i128
@@ -22,11 +20,9 @@
 define i128 @add_int128_swap(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_int128_swap:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhd 4, 4, 3
-; CHECK-P9-NEXT:    sradi 7, 5, 63
-; CHECK-P9-NEXT:    addc 3, 5, 6
-; CHECK-P9-NEXT:    adde 4, 7, 4
+; CHECK-P9-NEXT:    maddld 6, 4, 3, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 3, 5
+; CHECK-P9-NEXT:    mr 3, 6
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i64 %a to i128
@@ -41,12 +37,9 @@
 ; CHECK-P9-LABEL: or_xor_add_int128:
 ; CHECK-P9:       # %bb.0: # %entry
 ; CHECK-P9-NEXT:    or 6, 3, 4
-; CHECK-P9-NEXT:    xor 3, 5, 3
-; CHECK-P9-NEXT:    mulld 7, 4, 6
-; CHECK-P9-NEXT:    mulhd 4, 4, 6
-; CHECK-P9-NEXT:    sradi 5, 3, 63
-; CHECK-P9-NEXT:    addc 3, 7, 3
-; CHECK-P9-NEXT:    adde 4, 4, 5
+; CHECK-P9-NEXT:    xor 5, 5, 3
+; CHECK-P9-NEXT:    maddld 3, 4, 6, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 6, 5
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i64 %a to i128
@@ -62,10 +55,9 @@
 define i128 @add_unsigned_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_unsigned_int128:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhdu 4, 4, 3
-; CHECK-P9-NEXT:    addc 3, 6, 5
-; CHECK-P9-NEXT:    addze 4, 4
+; CHECK-P9-NEXT:    maddld 6, 4, 3, 5
+; CHECK-P9-NEXT:    maddhdu 4, 4, 3, 5
+; CHECK-P9-NEXT:    mr 3, 6
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = zext i64 %a to i128
@@ -192,9 +184,9 @@
 define i128 @add_int128_highDWZero(i32 noundef %a, i31 noundef %b, i63 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_int128_highDWZero:
 ; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    clrldi 5, 5, 1
 ; CHECK-P9-NEXT:    clrldi 3, 3, 32
 ; CHECK-P9-NEXT:    clrldi 4, 4, 33
-; CHECK-P9-NEXT:    clrldi 5, 5, 1
 ; CHECK-P9-NEXT:    maddld 3, 4, 3, 5
 ; CHECK-P9-NEXT:    li 4, 0
 ; CHECK-P9-NEXT:    blr
@@ -210,14 +202,11 @@
 define i128 @add_int128_sext_highDWNonZero(i16 noundef %a, i16 noundef %b, i16 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: add_int128_sext_highDWNonZero:
 ; CHECK-P9:       # %bb.0: # %entry
-; CHECK-P9-NEXT:    extsh 3, 3
+; CHECK-P9-NEXT:    extsh 5, 5
+; CHECK-P9-NEXT:    extsh 6, 3
 ; CHECK-P9-NEXT:    extsh 4, 4
-; CHECK-P9-NEXT:    mulld 6, 4, 3
-; CHECK-P9-NEXT:    mulhd 4, 4, 3
-; CHECK-P9-NEXT:    extsh 3, 5
-; CHECK-P9-NEXT:    sradi 5, 3, 63
-; CHECK-P9-NEXT:    addc 3, 6, 3
-; CHECK-P9-NEXT:    adde 4, 4, 5
+; CHECK-P9-NEXT:    maddld 3, 4, 6, 5
+; CHECK-P9-NEXT:    maddhd 4, 4, 6, 5
 ; CHECK-P9-NEXT:    blr
 entry:
   %conv = sext i16 %a to i128
diff --git a/llvm/test/CodeGen/PowerPC/mulld.ll b/llvm/test/CodeGen/PowerPC/mulld.ll
--- a/llvm/test/CodeGen/PowerPC/mulld.ll
+++ b/llvm/test/CodeGen/PowerPC/mulld.ll
@@ -10,10 +10,9 @@
 define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* nocapture readonly %b) {
 ; CHECK-LABEL: bn_mul_comba8:
 ; CHECK:    mulhdu
-; CHECK-NEXT:    mulld
-; CHECK:    mulhdu
-; CHECK:    mulld
-; CHECK-NEXT:    mulhdu
+; CHECK:    maddhdu
+; CHECK-NEXT:    maddld
+; CHECK:    maddhdu

 ; CHECK-ITIN-LABEL: bn_mul_comba8: