diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -541,12 +541,19 @@
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 
-  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+  if (Subtarget->hasCSSC()) {
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+    setOperationAction(ISD::CTPOP, MVT::i128, Expand);
+    setOperationAction(ISD::PARITY, MVT::i128, Expand);
+  } else {
+    setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+    setOperationAction(ISD::CTPOP, MVT::i128, Custom);
 
-  setOperationAction(ISD::PARITY, MVT::i64, Custom);
-  setOperationAction(ISD::PARITY, MVT::i128, Custom);
+    setOperationAction(ISD::PARITY, MVT::i64, Custom);
+    setOperationAction(ISD::PARITY, MVT::i128, Custom);
+  }
 
   setOperationAction(ISD::ABS, MVT::i32, Custom);
   setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -8413,8 +8420,16 @@
     return SDValue();
 
   bool IsParity = Op.getOpcode() == ISD::PARITY;
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
 
-  // While there is no integer popcount instruction, it can
+  // For i32 parity, the generic expansion using EORs is more efficient than
+  // going through the floating-point unit.
+  if (VT == MVT::i32 && IsParity)
+    return SDValue();
+
+  // If there is no CNT instruction available, GPR popcount can
   // be more efficiently lowered to the following sequence that uses
   // AdvSIMD registers/instructions as long as the copies to/from
   // the AdvSIMD registers are cheap.
@@ -8422,10 +8437,6 @@
   //    CNT     V0.8B, V0.8B  // 8xbyte pop-counts
   //    ADDV    B0, V0.8B     // sum 8xbyte pop-counts
   //    UMOV    X0, V0.B[0]   // copy byte result back to integer reg
-  SDValue Val = Op.getOperand(0);
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
   if (VT == MVT::i32 || VT == MVT::i64) {
     if (VT == MVT::i32)
       Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -8529,7 +8529,7 @@
 // General Data-Processing Instructions (FEAT_V94_DP)
 //===----------------------------------------------------------------------===//
 defm ABS : OneOperandData<0b001000, "abs">, Requires<[HasCSSC]>;
-defm CNT : OneOperandData<0b000111, "cnt">, Requires<[HasCSSC]>;
+defm CNT : OneOperandData<0b000111, "cnt", ctpop>, Requires<[HasCSSC]>;
 defm CTZ : OneOperandData<0b000110, "ctz">, Requires<[HasCSSC]>;
 
 defm SMAX : ComparisonOp<0, 0, "smax">, Requires<[HasCSSC]>;
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr -neon -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-NONEON %s
+; RUN: llc < %s -mtriple=aarch64-eabi -mattr +cssc -aarch64-neon-syntax=apple | FileCheck -check-prefix=CHECK-CSSC %s
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
 ; CHECK-LABEL: cnt32_advsimd:
@@ -27,6 +28,11 @@
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -57,6 +63,13 @@
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32_advsimd_2:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-CSSC-NEXT:    fmov w8, s0
+; CHECK-CSSC-NEXT:    cnt w0, w8
+; CHECK-CSSC-NEXT:    ret
   %1 = extractelement <2 x i32> %x, i64 0
   %2 = tail call i32 @llvm.ctpop.i32(i32 %1)
   ret i32 %2
@@ -86,6 +99,11 @@
 ; CHECK-NONEON-NEXT:    mul x8, x9, x8
 ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64_advsimd:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -125,6 +143,11 @@
 ; CHECK-NONEON-NEXT:    mul w8, w9, w8
 ; CHECK-NONEON-NEXT:    lsr w0, w8, #24
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w0, w0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 }
@@ -161,6 +184,11 @@
 ; CHECK-NONEON-NEXT:    mul x8, x9, x8
 ; CHECK-NONEON-NEXT:    lsr x0, x8, #56
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: cnt64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x0, x0
+; CHECK-CSSC-NEXT:    ret
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 }
@@ -181,6 +209,13 @@
 ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
 ; CHECK-NONEON-NEXT:    cset w0, ne
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_eq_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, eq
+; CHECK-CSSC-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp eq i64 %count, 1
   %conv = zext i1 %cmp to i32
@@ -203,6 +238,13 @@
 ; CHECK-NONEON-NEXT:    ccmp x0, #0, #4, eq
 ; CHECK-NONEON-NEXT:    cset w0, eq
 ; CHECK-NONEON-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_ne_one:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    cmp x8, #1
+; CHECK-CSSC-NEXT:    cset w0, ne
+; CHECK-CSSC-NEXT:    ret
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cmp = icmp ne i64 %count, 1
   %conv = zext i1 %cmp to i32
diff --git a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
--- a/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
+++ b/llvm/test/CodeGen/AArch64/ctpop-nonean.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon -mattr=+cssc < %s | FileCheck %s -check-prefix=CHECK-CSSC
 
 declare i128 @llvm.ctpop.i128(i128)
 
@@ -31,6 +32,14 @@
 ; CHECK-NEXT:    lsr x9, x9, #56
 ; CHECK-NEXT:    add x0, x9, x8, lsr #56
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: ctpop_i128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x1
+; CHECK-CSSC-NEXT:    cnt x9, x0
+; CHECK-CSSC-NEXT:    add x0, x9, x8
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    ret
   %c = call i128 @llvm.ctpop.i128(i128 %i)
   ret i128 %c
 }
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mattr=+cssc | FileCheck %s -check-prefix=CHECK-CSSC
 
 define i4 @parity_4(i4 %x) {
 ; CHECK-LABEL: parity_4:
@@ -9,6 +10,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_4:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xf
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i4 @llvm.ctpop.i4(i4 %x)
   %2 = and i4 %1, 1
   ret i4 %2
@@ -23,6 +31,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i8 @llvm.ctpop.i8(i8 %x)
   %2 = and i8 %1, 1
   ret i8 %2
@@ -38,6 +53,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_16:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i16 @llvm.ctpop.i16(i16 %x)
   %2 = and i16 %1, 1
   ret i16 %2
@@ -54,6 +76,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_17:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0x1ffff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i17 @llvm.ctpop.i17(i17 %x)
   %2 = and i17 %1, 1
   ret i17 %2
@@ -69,6 +98,12 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
   %2 = and i32 %1, 1
   ret i32 %2
@@ -83,6 +118,12 @@
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
   %2 = and i64 %1, 1
   ret i64 %2
@@ -99,6 +140,14 @@
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_128:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    eor x8, x0, x1
+; CHECK-CSSC-NEXT:    mov x1, xzr
+; CHECK-CSSC-NEXT:    cnt x8, x8
+; CHECK-CSSC-NEXT:    and x0, x8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i128 @llvm.ctpop.i128(i128 %x)
   %2 = and i128 %1, 1
   ret i128 %2
@@ -113,6 +162,12 @@
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_64_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt x8, x0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i64 @llvm.ctpop.i64(i64 %x)
   %2 = trunc i64 %1 to i32
   %3 = and i32 %2, 1
@@ -129,6 +184,12 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_32_trunc:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    cnt w8, w0
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
   %2 = trunc i32 %1 to i8
   %3 = and i8 %2, 1
@@ -144,6 +205,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_zext:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %a = zext i8 %x to i32
   %b = tail call i32 @llvm.ctpop.i32(i32 %a)
   %c = and i32 %b, 1
@@ -159,6 +227,13 @@
 ; CHECK-NEXT:    eor w8, w8, w8, lsr #1
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
+;
+; CHECK-CSSC-LABEL: parity_8_mask:
+; CHECK-CSSC:       // %bb.0:
+; CHECK-CSSC-NEXT:    and w8, w0, #0xff
+; CHECK-CSSC-NEXT:    cnt w8, w8
+; CHECK-CSSC-NEXT:    and w0, w8, #0x1
+; CHECK-CSSC-NEXT:    ret
   %a = and i32 %x, 255
   %b = tail call i32 @llvm.ctpop.i32(i32 %a)
   %c = and i32 %b, 1
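
Note (reviewer addition, not part of the patch): a minimal C sketch of the intended net effect of this change, assuming a clang recent enough to accept the +cssc target feature. The file/function names and the build line are illustrative assumptions, not part of the patch.

  #include <stdint.h>

  // With CSSC enabled, scalar popcount is expected to select the GPR CNT
  // instruction directly (e.g. "cnt x0, x0"); without CSSC it goes through
  // the AdvSIMD FMOV/CNT/ADDV/UMOV sequence, or a multiply-based expansion
  // when NEON is unavailable (see the CHECK-NONEON lines above).
  uint64_t popcount64(uint64_t x) {
    return (uint64_t)__builtin_popcountll(x);
  }

  // i64 parity: with CSSC this should lower to cnt + and (see parity_64
  // above); without CSSC it goes through the AdvSIMD popcount path.
  int parity64(uint64_t x) {
    return __builtin_parityll(x);
  }

  // Illustrative build line (the exact -march/-mattr spelling for enabling
  // CSSC may vary by clang version):
  //   clang -O2 --target=aarch64-linux-gnu -march=armv8.8-a+cssc -S popcnt.c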