diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1415,6 +1415,7 @@ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineADDToMADD(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -17227,6 +17227,112 @@ return MatPCRel; } +// Look for i128 multiply-add opportunities: +// A, B, C are i128 operands +// NumSignBits >= 65: +// (add (mul A B) C) => (build_pair (maddld (trunc A), (trunc B), (trunc C)), +// (maddhd (trunc A), (trunc B), (trunc C))) +// NumSignBits == 64 && SignBitIsZero: +// (add (mul A B) C) => (build_pair (maddld (trunc A), (trunc B), (trunc C)), +// (maddhdu (trunc A), (trunc B), (trunc C))) +SDValue PPCTargetLowering::combineADDToMADD(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + if (!DCI.isBeforeLegalize()) + return SDValue(); + + if (!Subtarget.isPPC64() || !Subtarget.isISA3_0()) + return SDValue(); + + if (N->getOpcode() != ISD::ADD) + return SDValue(); + + if (N->getValueType(0) != MVT::i128) + return SDValue(); + + SDValue MulOp = N->getOperand(0); + SDValue AddOp = N->getOperand(1); + + if (MulOp.getOpcode() != ISD::MUL) + std::swap(MulOp, AddOp); + + if (MulOp.getOpcode() != ISD::MUL) + return SDValue(); + + // Do not transform if there are other users of the mul. + if (!MulOp.hasOneUse()) + return SDValue(); + + // Operands have at least 65 NumSignBits can be handled by madd signed. + // Operands have 64 NumSignBits and proved SignBitIsZero should be handled + // by madd unsigned. + auto IsLegalOperand = [&DAG](SDValue Op, unsigned &NumSignBits, + bool &IsUnsigned) -> bool { + IsUnsigned = false; + if (DAG.SignBitIsZero(Op)) + IsUnsigned = true; + + NumSignBits = DAG.ComputeNumSignBits(Op); + // Madd signed case. + if (NumSignBits >= 65) + return true; + + // Madd unsigned case. + if (NumSignBits == 64 && IsUnsigned) + return true; + + return false; + }; + + SmallVector MADDOps; + MADDOps.push_back(MulOp->getOperand(0)); + MADDOps.push_back(MulOp->getOperand(1)); + MADDOps.push_back(AddOp); + + SmallVector MADDOpsNumSignBits; + bool AllUnsigned = true; + bool ForceUnsigned = false; + + for (unsigned i = 0; i < 3; ++i) { + unsigned OpNumSignBits = 0; + bool OpIsUnsigned = false; + if (!IsLegalOperand(MADDOps[i], OpNumSignBits, OpIsUnsigned)) { + return SDValue(); + } + MADDOpsNumSignBits.push_back(OpNumSignBits); + ForceUnsigned |= (OpNumSignBits == 64 && OpIsUnsigned); + AllUnsigned &= OpIsUnsigned; + } + + // Check for Madd unsigned/signed conflict. + if (ForceUnsigned && !AllUnsigned) + return SDValue(); + + SDLoc dl(N); + SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MADDOps[0]); + SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MADDOps[1]); + SDValue Op2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, MADDOps[2]); + + // For unsigned case, in case multiply operands have total less than 64 bits, + // and the other add operand has less than 64 bits, then the higher half + // result will be zero. + bool MAddHIsZero = false; + if (AllUnsigned && + (((128 - MADDOpsNumSignBits[0]) + (128 - MADDOpsNumSignBits[1])) < 64) && + ((128 - MADDOpsNumSignBits[2]) < 64)) + MAddHIsZero = true; + + SDValue MAddL = + BuildIntrinsicOp(Intrinsic::ppc_maddld, Op0, Op1, Op2, DAG, dl); + SDValue MAddH = MAddHIsZero + ? DAG.getConstant(0, dl, MVT::i64) + : BuildIntrinsicOp(ForceUnsigned ? Intrinsic::ppc_maddhdu + : Intrinsic::ppc_maddhd, + Op0, Op1, Op2, DAG, dl); + SDValue Combined = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128, MAddL, MAddH); + return Combined; +} + SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const { if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget)) return Value; @@ -17234,6 +17340,9 @@ if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget)) return Value; + if (auto Value = combineADDToMADD(N, DCI)) + return Value; + return SDValue(); } diff --git a/llvm/test/CodeGen/PowerPC/add-int128-madd.ll b/llvm/test/CodeGen/PowerPC/add-int128-madd.ll --- a/llvm/test/CodeGen/PowerPC/add-int128-madd.ll +++ b/llvm/test/CodeGen/PowerPC/add-int128-madd.ll @@ -5,11 +5,9 @@ define i128 @add_int64_sext(i64 noundef %a, i64 noundef %b, i64 noundef %c) { ; CHECK-P9-LABEL: add_int64_sext: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhd 4, 4, 3 -; CHECK-P9-NEXT: sradi 7, 5, 63 -; CHECK-P9-NEXT: addc 3, 6, 5 -; CHECK-P9-NEXT: adde 4, 4, 7 +; CHECK-P9-NEXT: maddld 6, 4, 3, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 3, 5 +; CHECK-P9-NEXT: mr 3, 6 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int64_sext: @@ -32,10 +30,9 @@ define i128 @add_int64_zext(i64 noundef %a, i64 noundef %b, i64 noundef %c) { ; CHECK-P9-LABEL: add_int64_zext: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhdu 4, 4, 3 -; CHECK-P9-NEXT: addc 3, 6, 5 -; CHECK-P9-NEXT: addze 4, 4 +; CHECK-P9-NEXT: maddld 6, 4, 3, 5 +; CHECK-P9-NEXT: maddhdu 4, 4, 3, 5 +; CHECK-P9-NEXT: mr 3, 6 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int64_zext: @@ -89,11 +86,9 @@ define i128 @add_int64_swap(i64 noundef %a, i64 noundef %b, i64 noundef %c) { ; CHECK-P9-LABEL: add_int64_swap: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhd 4, 4, 3 -; CHECK-P9-NEXT: sradi 7, 5, 63 -; CHECK-P9-NEXT: addc 3, 5, 6 -; CHECK-P9-NEXT: adde 4, 7, 4 +; CHECK-P9-NEXT: maddld 6, 4, 3, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 3, 5 +; CHECK-P9-NEXT: mr 3, 6 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int64_swap: @@ -116,12 +111,10 @@ define i128 @add_mix_zext(i63 noundef %a, i63 noundef %b, i64 noundef %c) { ; CHECK-P9-LABEL: add_mix_zext: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: clrldi 3, 3, 1 +; CHECK-P9-NEXT: clrldi 6, 3, 1 ; CHECK-P9-NEXT: clrldi 4, 4, 1 -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhdu 4, 4, 3 -; CHECK-P9-NEXT: addc 3, 6, 5 -; CHECK-P9-NEXT: addze 4, 4 +; CHECK-P9-NEXT: maddld 3, 4, 6, 5 +; CHECK-P9-NEXT: maddhdu 4, 4, 6, 5 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_mix_zext: @@ -146,16 +139,13 @@ ; CHECK-P9-LABEL: add_int63_sext: ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: sldi 3, 3, 1 -; CHECK-P9-NEXT: sldi 4, 4, 1 -; CHECK-P9-NEXT: sradi 3, 3, 1 -; CHECK-P9-NEXT: sradi 4, 4, 1 -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhd 4, 4, 3 -; CHECK-P9-NEXT: sldi 3, 5, 1 -; CHECK-P9-NEXT: sradi 5, 3, 1 -; CHECK-P9-NEXT: sradi 7, 3, 63 -; CHECK-P9-NEXT: addc 3, 6, 5 -; CHECK-P9-NEXT: adde 4, 4, 7 +; CHECK-P9-NEXT: sldi 5, 5, 1 +; CHECK-P9-NEXT: sradi 6, 3, 1 +; CHECK-P9-NEXT: sldi 3, 4, 1 +; CHECK-P9-NEXT: sradi 5, 5, 1 +; CHECK-P9-NEXT: sradi 4, 3, 1 +; CHECK-P9-NEXT: maddld 3, 4, 6, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 6, 5 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int63_sext: @@ -184,13 +174,11 @@ define i128 @add_int63_zext(i63 noundef %a, i63 noundef %b, i63 noundef %c) { ; CHECK-P9-LABEL: add_int63_zext: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: clrldi 3, 3, 1 +; CHECK-P9-NEXT: clrldi 5, 5, 1 +; CHECK-P9-NEXT: clrldi 6, 3, 1 ; CHECK-P9-NEXT: clrldi 4, 4, 1 -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhdu 4, 4, 3 -; CHECK-P9-NEXT: clrldi 3, 5, 1 -; CHECK-P9-NEXT: addc 3, 6, 3 -; CHECK-P9-NEXT: addze 4, 4 +; CHECK-P9-NEXT: maddld 3, 4, 6, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 6, 5 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int63_zext: @@ -215,16 +203,13 @@ define i128 @add_int63_hybrid_ext(i63 noundef %a, i63 noundef %b, i63 noundef %c) { ; CHECK-P9-LABEL: add_int63_hybrid_ext: ; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: sldi 5, 5, 1 ; CHECK-P9-NEXT: sldi 3, 3, 1 +; CHECK-P9-NEXT: sradi 5, 5, 1 ; CHECK-P9-NEXT: clrldi 4, 4, 1 -; CHECK-P9-NEXT: sradi 3, 3, 1 -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhd 4, 4, 3 -; CHECK-P9-NEXT: sldi 3, 5, 1 -; CHECK-P9-NEXT: sradi 5, 3, 1 -; CHECK-P9-NEXT: sradi 7, 3, 63 -; CHECK-P9-NEXT: addc 3, 6, 5 -; CHECK-P9-NEXT: adde 4, 4, 7 +; CHECK-P9-NEXT: sradi 6, 3, 1 +; CHECK-P9-NEXT: maddld 3, 4, 6, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 6, 5 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_int63_hybrid_ext: @@ -291,9 +276,9 @@ define i128 @add_highDWZero(i32 noundef %a, i31 noundef %b, i63 noundef %c) { ; CHECK-P9-LABEL: add_highDWZero: ; CHECK-P9: # %bb.0: # %entry +; CHECK-P9-NEXT: clrldi 5, 5, 1 ; CHECK-P9-NEXT: clrldi 3, 3, 32 ; CHECK-P9-NEXT: clrldi 4, 4, 33 -; CHECK-P9-NEXT: clrldi 5, 5, 1 ; CHECK-P9-NEXT: maddld 3, 4, 3, 5 ; CHECK-P9-NEXT: li 4, 0 ; CHECK-P9-NEXT: blr @@ -319,14 +304,11 @@ define i128 @add_sext_highDWNonZero(i16 noundef %a, i16 noundef %b, i16 noundef %c) { ; CHECK-P9-LABEL: add_sext_highDWNonZero: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: extsh 3, 3 +; CHECK-P9-NEXT: extsh 5, 5 +; CHECK-P9-NEXT: extsh 6, 3 ; CHECK-P9-NEXT: extsh 4, 4 -; CHECK-P9-NEXT: mulld 6, 4, 3 -; CHECK-P9-NEXT: mulhd 4, 4, 3 -; CHECK-P9-NEXT: extsh 3, 5 -; CHECK-P9-NEXT: sradi 5, 3, 63 -; CHECK-P9-NEXT: addc 3, 6, 3 -; CHECK-P9-NEXT: adde 4, 4, 5 +; CHECK-P9-NEXT: maddld 3, 4, 6, 5 +; CHECK-P9-NEXT: maddhd 4, 4, 6, 5 ; CHECK-P9-NEXT: blr ; ; CHECK-P8-LABEL: add_sext_highDWNonZero: diff --git a/llvm/test/CodeGen/PowerPC/mulld.ll b/llvm/test/CodeGen/PowerPC/mulld.ll --- a/llvm/test/CodeGen/PowerPC/mulld.ll +++ b/llvm/test/CodeGen/PowerPC/mulld.ll @@ -10,10 +10,9 @@ define void @bn_mul_comba8(ptr nocapture %r, ptr nocapture readonly %a, ptr nocapture readonly %b) { ; CHECK-LABEL: bn_mul_comba8: ; CHECK: mulhdu -; CHECK-NEXT: mulld -; CHECK: mulhdu -; CHECK: mulld -; CHECK-NEXT: mulhdu +; CHECK: maddhdu +; CHECK-NEXT: maddld +; CHECK: maddhdu ; CHECK-ITIN-LABEL: bn_mul_comba8: