diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1112,6 +1112,7 @@
     SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -391,6 +391,7 @@
   if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
     setOperationAction(ISD::CTPOP, MVT::i32  , Legal);
     setOperationAction(ISD::CTPOP, MVT::i64  , Legal);
+    setOperationAction(ISD::CTPOP, MVT::i128 , Custom);
   } else {
     setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
     setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
@@ -9655,6 +9656,44 @@
   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
 }
 
+
+/// LowerCTPOP - Custom lowering for CTPOP(i128). The operand is reinterpreted
+/// as v2i64, each doubleword is counted with CTPOP(v2i64), and the two lane
+/// counts are added together:
+///   ctpop(i128 X) = zext(ctpop(X[0]) + ctpop(X[1]))
+/// Returns an empty SDValue when the subtarget lacks the P8 vector facility;
+/// the callers treat that as "not custom lowered".
+SDValue PPCTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::CTPOP &&
+         "Should only be called for ISD::CTPOP");
+  assert((Op.getValueType() == MVT::i128) &&
+         "Only set i128 as custom, other type shouldn't reach here!");
+
+  // The vector form only pays off when CTPOP(v2i64) is a native operation.
+  // NOTE(review): assumes vpopcntd is gated on hasP8Altivec() - confirm.
+  if (!Subtarget.hasP8Altivec())
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
+  SDValue Vec = DAG.getBitcast(MVT::v2i64, N0);
+  SDValue Counts = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Vec);
+
+  // Sum the two lanes. Adding the vector to itself (VADDUDM Counts, Counts)
+  // would only double each lane, and bitcasting that back to i128 would leave
+  // a non-zero upper doubleword, so extract both lanes and add them as
+  // scalars instead. The sum does not depend on which lane is which, so no
+  // endian-specific handling is needed.
+  SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Counts,
+                             DAG.getIntPtrConstant(0, dl));
+  SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Counts,
+                             DAG.getIntPtrConstant(1, dl));
+  SDValue Sum = DAG.getNode(ISD::ADD, dl, MVT::i64, Elt0, Elt1);
+
+  // The total is at most 128, so zero-extension gives the exact i128 result.
+  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, Sum);
+}
+
 /// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is /// a multiple of 8. Otherwise convert it to a scalar rotation(i128) @@ -10987,6 +11002,7 @@ case ISD::ABS: return LowerABS(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::ROTL: return LowerROTL(Op, DAG); + case ISD::CTPOP: return LowerCTPOP(Op, DAG); // For counter-based loop handling. case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -11076,6 +11092,12 @@ case ISD::BITCAST: // Don't handle bitcast here. return; + case ISD::CTPOP: { + SDValue Lowered = LowerCTPOP(SDValue(N, 0), DAG); + if (Lowered) + Results.push_back(Lowered); + return; + } case ISD::FP_EXTEND: SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG); if (Lowered) diff --git a/llvm/test/CodeGen/PowerPC/popcount.ll b/llvm/test/CodeGen/PowerPC/popcount.ll --- a/llvm/test/CodeGen/PowerPC/popcount.ll +++ b/llvm/test/CodeGen/PowerPC/popcount.ll @@ -5,12 +5,14 @@ define i8 @popcount128(i128* nocapture nonnull readonly %0) { ; CHECK-LABEL: popcount128: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: ld 4, 0(3) -; CHECK-NEXT: ld 3, 8(3) -; CHECK-NEXT: popcntd 3, 3 -; CHECK-NEXT: popcntd 4, 4 -; CHECK-NEXT: add 3, 4, 3 -; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3 +; CHECK-NEXT: lxvd2x 0, 0, 3 +; CHECK-NEXT: xxswapd 0, 0 +; CHECK-NEXT: xxlor 34, 0, 0 +; CHECK-NEXT: vpopcntd 2, 2 +; CHECK-NEXT: vaddudm 2, 2, 2 +; CHECK-NEXT: xxswapd 0, 34 +; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 +; CHECK-NEXT: mffprwz 3, 0 ; CHECK-NEXT: clrldi 3, 3, 56 ; CHECK-NEXT: blr Entry: @@ -27,16 +29,23 @@ define i16 @popcount256(i256* nocapture nonnull readonly %0) { ; CHECK-LABEL: popcount256: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: ld 4, 0(3) -; CHECK-NEXT: ld 5, 8(3) -; CHECK-NEXT: ld 6, 16(3) -; CHECK-NEXT: ld 3, 24(3) -; CHECK-NEXT: popcntd 3, 3 -; CHECK-NEXT: popcntd 6, 6 -; CHECK-NEXT: add 3, 6, 3 -; CHECK-NEXT: popcntd 5, 5 -; CHECK-NEXT: popcntd 4, 4 -; CHECK-NEXT: add 4, 4, 5 +; CHECK-NEXT: li 4, 16 +; 
CHECK-NEXT: lxvd2x 0, 3, 4 +; CHECK-NEXT: xxswapd 0, 0 +; CHECK-NEXT: xxlor 34, 0, 0 +; CHECK-NEXT: lxvd2x 0, 0, 3 +; CHECK-NEXT: xxswapd 0, 0 +; CHECK-NEXT: xxlor 35, 0, 0 +; CHECK-NEXT: vpopcntd 2, 2 +; CHECK-NEXT: vaddudm 2, 2, 2 +; CHECK-NEXT: xxswapd 0, 34 +; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 +; CHECK-NEXT: mffprd 3, 0 +; CHECK-NEXT: vpopcntd 2, 3 +; CHECK-NEXT: vaddudm 2, 2, 2 +; CHECK-NEXT: xxswapd 0, 34 +; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 +; CHECK-NEXT: mffprd 4, 0 ; CHECK-NEXT: add 3, 4, 3 ; CHECK-NEXT: # kill: def $r3 killed $r3 killed $x3 ; CHECK-NEXT: clrldi 3, 3, 48 @@ -54,21 +63,8 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-LABEL: popcount1x128: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: xxlor 0, 34, 34 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 -; CHECK-NEXT: mffprd 3, 0 -; CHECK-NEXT: popcntd 3, 3 -; CHECK-NEXT: xxswapd 0, 34 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $vsl0 -; CHECK-NEXT: mffprd 4, 0 -; CHECK-NEXT: popcntd 4, 4 -; CHECK-NEXT: add 3, 4, 3 -; CHECK-NEXT: mtfprd 0, 3 -; CHECK-NEXT: # kill: def $vsl0 killed $f0 -; CHECK-NEXT: li 3, 0 -; CHECK-NEXT: mtfprd 1, 3 -; CHECK-NEXT: # kill: def $vsl1 killed $f1 -; CHECK-NEXT: xxmrghd 34, 1, 0 +; CHECK-NEXT: vpopcntd 2, 2 +; CHECK-NEXT: vaddudm 2, 2, 2 ; CHECK-NEXT: blr Entry: %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0)