diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1002,7 +1002,7 @@
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -521,6 +521,9 @@
   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
   setOperationAction(ISD::CTPOP, MVT::i128, Custom);
 
+  setOperationAction(ISD::PARITY, MVT::i64, Custom);
+  setOperationAction(ISD::PARITY, MVT::i128, Custom);
+
   setOperationAction(ISD::ABS, MVT::i32, Custom);
   setOperationAction(ISD::ABS, MVT::i64, Custom);
 
@@ -5463,7 +5466,8 @@
   case ISD::SRA_PARTS:
     return LowerShiftParts(Op, DAG);
   case ISD::CTPOP:
-    return LowerCTPOP(Op, DAG);
+  case ISD::PARITY:
+    return LowerCTPOP_PARITY(Op, DAG);
   case ISD::FCOPYSIGN:
     return LowerFCOPYSIGN(Op, DAG);
   case ISD::OR:
@@ -7783,7 +7787,7 @@
   return BitCast(VT, BSP, DAG);
 }
 
-SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const {
   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
           Attribute::NoImplicitFloat))
     return SDValue();
@@ -7791,6 +7795,8 @@
   if (!Subtarget->hasNEON())
     return SDValue();
 
+  bool IsParity = Op.getOpcode() == ISD::PARITY;
+
   // While there is no integer popcount instruction, it can
   // be more efficiently lowered to the following sequence that uses
   // AdvSIMD registers/instructions as long as the copies to/from
@@ -7813,6 +7819,10 @@
         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
 
+    if (IsParity)
+      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+                           DAG.getConstant(1, DL, MVT::i32));
+
     if (VT == MVT::i64)
       UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
     return UaddLV;
@@ -7824,9 +7834,15 @@
         ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
         DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
 
+    if (IsParity)
+      UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+                           DAG.getConstant(1, DL, MVT::i32));
+
     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
   }
 
+  assert(!IsParity && "ISD::PARITY of vector types not supported");
+
   if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
 
@@ -20016,7 +20032,8 @@
     return;
   case ISD::CTPOP:
-    if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
+  case ISD::PARITY:
+    if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
       Results.push_back(Result);
     return;
   case AArch64ISD::SADDV:
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -77,13 +77,11 @@
 define i64 @parity_64(i64 %x) {
 ; CHECK-LABEL: parity_64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x0, x0, lsr #32
-; CHECK-NEXT:    eor x8, x8, x8, lsr #16
-; CHECK-NEXT:    eor x8, x8, x8, lsr #8
-; CHECK-NEXT:    eor x8, x8, x8, lsr #4
-; CHECK-NEXT:    eor x8, x8, x8, lsr #2
-; CHECK-NEXT:    eor w8, w8, w8, lsr #1
-; CHECK-NEXT:    and x0, x8, #0x1
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
   %2 = and i64 %1, 1
@@ -93,15 +91,13 @@
 define i128 @parity_128(i128 %x) {
 ; CHECK-LABEL: parity_128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x0, x1
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov v0.d[1], x1
 ; CHECK-NEXT:    mov x1, xzr
-; CHECK-NEXT:    eor x8, x8, x8, lsr #32
-; CHECK-NEXT:    eor x8, x8, x8, lsr #16
-; CHECK-NEXT:    eor x8, x8, x8, lsr #8
-; CHECK-NEXT:    eor x8, x8, x8, lsr #4
-; CHECK-NEXT:    eor x8, x8, x8, lsr #2
-; CHECK-NEXT:    eor w8, w8, w8, lsr #1
-; CHECK-NEXT:    and x0, x8, #0x1
+; CHECK-NEXT:    cnt v0.16b, v0.16b
+; CHECK-NEXT:    uaddlv h0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call i128 @llvm.ctpop.i128(i128 %x)
   %2 = and i128 %1, 1
@@ -111,12 +107,10 @@
 define i32 @parity_64_trunc(i64 %x) {
 ; CHECK-LABEL: parity_64_trunc:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    eor x8, x0, x0, lsr #32
-; CHECK-NEXT:    eor x8, x8, x8, lsr #16
-; CHECK-NEXT:    eor x8, x8, x8, lsr #8
-; CHECK-NEXT:    eor x8, x8, x8, lsr #4
-; CHECK-NEXT:    eor x8, x8, x8, lsr #2
-; CHECK-NEXT:    eor w8, w8, w8, lsr #1
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    cnt v0.8b, v0.8b
+; CHECK-NEXT:    uaddlv h0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
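
For reference, a minimal IR reproducer (sketch only, not part of the patch; the function name is illustrative). The and-of-ctpop idiom below is the pattern DAGCombine folds to ISD::PARITY, so running `llc -mtriple=aarch64` on this file should emit the cnt/uaddlv/and sequence checked in parity.ll above, assuming NEON is available and the function is not marked noimplicitfloat.

; Sketch: exercises the new ISD::PARITY lowering path.
define i64 @parity_example(i64 %x) {
  ; Popcount, then keep only the low bit; DAGCombine recognizes
  ; this pair as a parity computation.
  %pop = tail call i64 @llvm.ctpop.i64(i64 %x)
  %par = and i64 %pop, 1
  ret i64 %par
}

declare i64 @llvm.ctpop.i64(i64)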