diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1406,6 +1406,7 @@ SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineADDESUBE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1360,6 +1360,9 @@ // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL, ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR}); + if (Subtarget.isISA3_0() && Subtarget.isPPC64()) { + setTargetDAGCombine({ISD::ADDE, ISD::SUBE}); + } if (Subtarget.hasFPCVT()) setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC}); @@ -15105,6 +15108,9 @@ default: break; case ISD::ADD: return combineADD(N, DCI); + case ISD::ADDE: + case ISD::SUBE: + return combineADDESUBE(N, DCI); case ISD::SHL: return combineSHL(N, DCI); case ISD::SRA: @@ -17134,6 +17140,150 @@ return SDValue(); } +// Look for multiply-add opportunities. The pattern is marked by glued +// ADDC/ADDE, SUBC/SUBE pairs with multiply and multiply high nodes on one side, +// and constant zero or splatted sign bit together with the other node on the +// other side. 
+// +// Pattern 1/2: +// (a MUL b) [ADD|SUB]C (c) carry-out --> (MADDLD a, b, [+|-]c) +// | ==> +// (a MULHS b) [ADD|SUB]E (SRA c, 63) carry-in <--- (MADDHD a, b, [+|-]c) +// +// Pattern 3: +// (a MUL b) ADDC (c) carry-out --> (MADDLD a, b, c) +// | ==> +// (a MULHU b) ADDE (0) carry-in <--- (MADDHDU a, b, c) +// +// Pattern 4: +// (c) SUBC (a MUL b) carry-out --> (MADDLD -a, b, c) +// | ==> +// (SRA c, 63) SUBE (a MULHS b) carry-in <--- (MADDHD -a, b, c) +SDValue PPCTargetLowering::combineADDESUBE(SDNode *N, + DAGCombinerInfo &DCI) const { + if (!Subtarget.isISA3_0() || !Subtarget.isPPC64()) + return SDValue(); + + if (N->getOpcode() != ISD::ADDE && N->getOpcode() != ISD::SUBE) + return SDValue(); + + SDNode *ASEN = N; + SDNode *ASCN = N->getOperand(2).getNode(); + + if (ASEN->getValueType(0) != MVT::i64 || ASCN->getValueType(0) != MVT::i64) + return SDValue(); + + // Check it should be ADDC/ADDE or SUBC/SUBE pair glued together. + if ((ASEN->getOpcode() == ISD::ADDE && ASCN->getOpcode() != ISD::ADDC) || + (ASEN->getOpcode() == ISD::SUBE && ASCN->getOpcode() != ISD::SUBC)) + return SDValue(); + + SDValue ASCOp0 = ASCN->getOperand(0); + SDValue ASCOp1 = ASCN->getOperand(1); + SDValue ASEOp0 = ASEN->getOperand(0); + SDValue ASEOp1 = ASEN->getOperand(1); + + // Check that two operands should be from different nodes. 
+  if (ASCOp0.getNode() == ASCOp1.getNode() ||
+      ASEOp0.getNode() == ASEOp1.getNode())
+    return SDValue();
+
+  SDValue MHOp;
+  SDValue ASHOp;
+  SDValue MLOp;
+  SDValue ASLOp;
+  int MHIdx = -1;
+  int MLIdx = -1;
+  if (ASEOp0->getOpcode() == ISD::MULHS || ASEOp0->getOpcode() == ISD::MULHU) {
+    MHOp = ASEOp0;
+    ASHOp = ASEOp1;
+    MHIdx = 0;
+  } else if (ASEOp1->getOpcode() == ISD::MULHS ||
+             ASEOp1->getOpcode() == ISD::MULHU) {
+    MHOp = ASEOp1;
+    ASHOp = ASEOp0;
+    MHIdx = 1;
+  } else
+    return SDValue();
+
+  if (ASCOp0->getOpcode() == ISD::MUL) {
+    MLOp = ASCOp0;
+    ASLOp = ASCOp1;
+    MLIdx = 0;
+  } else if (ASCOp1->getOpcode() == ISD::MUL) {
+    MLOp = ASCOp1;
+    ASLOp = ASCOp0;
+    MLIdx = 1;
+  } else
+    return SDValue();
+
+  // Node c cannot be (a MULH[S|U] b) or [ADD|SUB]E node.
+  if (ASLOp.getNode() == MHOp.getNode() || ASLOp.getNode() == ASEN)
+    return SDValue();
+
+  // SUBE unsigned is not worth the effort.
+  if (ASEN->getOpcode() == ISD::SUBE && MHOp->getOpcode() == ISD::MULHU)
+    return SDValue();
+
+  // SUBC/SUBE signed must have multiply nodes on the same side.
+  if (ASEN->getOpcode() == ISD::SUBE && MHOp->getOpcode() == ISD::MULHS &&
+      MHIdx != MLIdx)
+    return SDValue();
+
+  // Check (SRA c, 63) for signed.
+  if (MHOp->getOpcode() == ISD::MULHS) {
+    if (ASHOp->getOpcode() != ISD::SRA)
+      return SDValue();
+    if (auto *CN = dyn_cast<ConstantSDNode>(ASHOp.getOperand(1)))
+      if (CN->getZExtValue() != 63)
+        return SDValue();
+    if (ASLOp.getNode() != ASHOp.getOperand(0).getNode())
+      return SDValue();
+  }
+
+  // Check constant zero for unsigned.
+  if (MHOp->getOpcode() == ISD::MULHU) {
+    auto *CN = dyn_cast<ConstantSDNode>(ASHOp);
+    if (!CN || CN->getZExtValue() != 0)
+      return SDValue();
+  }
+
+  // Check that multiply and multiply high nodes have the same pair of operands.
+ if (!(((MHOp.getOperand(0).getNode() == MLOp.getOperand(0).getNode()) && + (MHOp.getOperand(1).getNode() == MLOp.getOperand(1).getNode())) || + ((MHOp.getOperand(0).getNode() == MLOp.getOperand(1).getNode()) && + (MHOp.getOperand(1).getNode() == MLOp.getOperand(0).getNode())))) + return SDValue(); + + SDValue Op0 = MHOp.getOperand(0); + SDValue Op1 = MHOp.getOperand(1); + SDValue Op2 = ASLOp; + + // Move the minus sign for SUBE. + if (ASEN->getOpcode() == ISD::SUBE) { + if (MHIdx == 0) + Op2 = DCI.DAG.getNode( + ISD::SUB, SDLoc(Op2.getNode()), MVT::i64, + DCI.DAG.getConstant(0, SDLoc(Op2.getNode()), MVT::i64), Op2); + else + Op0 = DCI.DAG.getNode( + ISD::SUB, SDLoc(Op0.getNode()), MVT::i64, + DCI.DAG.getConstant(0, SDLoc(Op0.getNode()), MVT::i64), Op0); + } + + SDValue NHNode = + BuildIntrinsicOp(MHOp->getOpcode() == ISD::MULHS ? Intrinsic::ppc_maddhd + : Intrinsic::ppc_maddhdu, + Op0, Op1, Op2, DCI.DAG, SDLoc(ASEN)); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(ASEN, 0), NHNode); + + SDValue NLNode = BuildIntrinsicOp(Intrinsic::ppc_maddld, Op0, Op1, Op2, + DCI.DAG, SDLoc(ASCN)); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(ASCN, 0), NLNode); + + return SDValue(ASEN, 0); +} + // Detect TRUNCATE operations on bitcasts of float128 values. // What we are looking for here is the situtation where we extract a subset // of bits from a 128 bit float. 
diff --git a/llvm/test/CodeGen/PowerPC/adde-sube-int128-madd.ll b/llvm/test/CodeGen/PowerPC/adde-sube-int128-madd.ll
--- a/llvm/test/CodeGen/PowerPC/adde-sube-int128-madd.ll
+++ b/llvm/test/CodeGen/PowerPC/adde-sube-int128-madd.ll
@@ -4,11 +4,9 @@
 define i128 @adde_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: adde_int128:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: mulld 6, 4, 3
-; CHECK-P9-NEXT: mulhd 4, 4, 3
-; CHECK-P9-NEXT: sradi 7, 5, 63
-; CHECK-P9-NEXT: addc 3, 6, 5
-; CHECK-P9-NEXT: adde 4, 4, 7
+; CHECK-P9-NEXT: maddld 6, 4, 3, 5
+; CHECK-P9-NEXT: maddhd 4, 4, 3, 5
+; CHECK-P9-NEXT: mr 3, 6
 ; CHECK-P9-NEXT: blr
 entry:
 %conv = sext i64 %a to i128
@@ -22,10 +20,9 @@
 define i128 @adde_unsigned_int128(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: adde_unsigned_int128:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: mulld 6, 4, 3
-; CHECK-P9-NEXT: mulhdu 4, 4, 3
-; CHECK-P9-NEXT: addc 3, 6, 5
-; CHECK-P9-NEXT: addze 4, 4
+; CHECK-P9-NEXT: maddld 6, 4, 3, 5
+; CHECK-P9-NEXT: maddhdu 4, 4, 3, 5
+; CHECK-P9-NEXT: mr 3, 6
 ; CHECK-P9-NEXT: blr
 entry:
 %conv = zext i64 %a to i128
@@ -39,11 +36,10 @@
 define i128 @sube_int128_AxBmC(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: sube_int128_AxBmC:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: mulld 6, 4, 3
-; CHECK-P9-NEXT: mulhd 4, 4, 3
-; CHECK-P9-NEXT: sradi 7, 5, 63
-; CHECK-P9-NEXT: subc 3, 6, 5
-; CHECK-P9-NEXT: subfe 4, 7, 4
+; CHECK-P9-NEXT: neg 6, 5
+; CHECK-P9-NEXT: maddld 5, 4, 3, 6
+; CHECK-P9-NEXT: maddhd 4, 4, 3, 6
+; CHECK-P9-NEXT: mr 3, 5
 ; CHECK-P9-NEXT: blr
 entry:
 %conv = sext i64 %a to i128
@@ -57,11 +53,10 @@
 define i128 @sube_int128_CmAxB(i64 noundef %a, i64 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
 ; CHECK-P9-LABEL: sube_int128_CmAxB:
 ; CHECK-P9: # %bb.0: # %entry
-; CHECK-P9-NEXT: mulld 7, 4, 3
-; CHECK-P9-NEXT: mulhd 4, 4, 3
-; CHECK-P9-NEXT: sradi 6, 5, 63
-; CHECK-P9-NEXT: subc 3, 5, 7
-; CHECK-P9-NEXT: subfe 4, 4, 6
+; CHECK-P9-NEXT: neg 4, 4
+; CHECK-P9-NEXT: maddld 6, 4, 3, 5
+; CHECK-P9-NEXT: maddhd 4, 4, 3, 5
+; CHECK-P9-NEXT: mr 3, 6
 ; CHECK-P9-NEXT: blr
 entry:
 %conv = sext i64 %c to i128
diff --git a/llvm/test/CodeGen/PowerPC/mulld.ll b/llvm/test/CodeGen/PowerPC/mulld.ll
--- a/llvm/test/CodeGen/PowerPC/mulld.ll
+++ b/llvm/test/CodeGen/PowerPC/mulld.ll
@@ -10,10 +10,9 @@
 define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* nocapture readonly %b) {
 ; CHECK-LABEL: bn_mul_comba8:
 ; CHECK: mulhdu
-; CHECK-NEXT: mulld
-; CHECK: mulhdu
-; CHECK: mulld
-; CHECK-NEXT: mulhdu
+; CHECK: maddhdu
+; CHECK-NEXT: maddld
+; CHECK: maddhdu
 ; CHECK-ITIN-LABEL: bn_mul_comba8: