Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -465,6 +465,10 @@
     return true;
   }
 
+  /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
+  /// expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -2171,6 +2171,11 @@
                         [IntrNoMem, IntrNoSync, IntrWillReturn,
                          ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
+def int_experimental_cttz_elts:
+  DefaultAttrsIntrinsic<[llvm_anyint_ty],
+                        [llvm_anyvector_ty, llvm_i32_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>,
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7488,6 +7488,64 @@
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_cttz_elts: {
+    auto DL = getCurSDLoc();
+    SDValue Op = getValue(I.getOperand(0));
+    EVT OpVT = Op.getValueType();
+
+    if (!TLI.shouldExpandCttzElements(OpVT)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    if (OpVT.getScalarType() != MVT::i1) {
+      // Compare the input vector elements to zero & use the result to count
+      // the trailing zero elements.
+      SDValue AllZero = DAG.getConstant(0, DL, OpVT);
+      OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                              OpVT.getVectorElementCount());
+      Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
+    }
+
+    // Find the smallest "sensible" element type to use for the expansion.
+    ConstantRange CR(
+        APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
+    if (OpVT.isScalableVT())
+      CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
+
+    unsigned EltWidth = I.getType()->getScalarSizeInBits();
+    EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+    EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+
+    MVT NewEltTy = MVT::getIntegerVT(EltWidth);
+
+    // Create the new vector type & get the vector length.
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
+                                 OpVT.getVectorElementCount());
+
+    SDValue VL =
+        DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
+
+    SDValue StepVec = DAG.getStepVector(DL, NewVT);
+    SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
+    SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
+    SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
+    SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
+
+    // If the result is VL, then the input was all zero. Return UNDEF in this
+    // case if zero-is-poison is set.
+    if (cast<ConstantSDNode>(getValue(I.getOperand(1)))->getZExtValue() != 0) {
+      Sub = DAG.getSelectCC(DL, Sub, VL, DAG.getUNDEF(NewEltTy), Sub,
+                            ISD::SETEQ);
+    }
+
+    EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, DL, RetTy, Sub);
+
+    setValue(&I, Ret);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -927,6 +927,8 @@
 
   bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
 
+  bool shouldExpandCttzElements(EVT VT) const override;
+
   /// If a change in streaming mode is required on entry to/return from a
   /// function call it emits and returns the corresponding SMSTART or SMSTOP node.
   /// \p Entry tells whether this is before/after the Call, which is necessary
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1757,6 +1757,13 @@
   return false;
 }
 
+bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
+  if (!Subtarget->hasSVE() || VT != MVT::nxv16i1)
+    return true;
+
+  return false;
+}
+
 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT,
                                                      bool StreamingSVE) {
   assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
@@ -5314,6 +5321,21 @@
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
                        Op.getOperand(1), Op.getOperand(2));
   }
+  case Intrinsic::experimental_cttz_elts: {
+    // Only lower this when zero-is-poison is false.
+    assert(cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() == 0 &&
+           "Expected zero-is-poison flag to be zero");
+
+    EVT Ty = Op.getValueType();
+    if (Ty == MVT::i64)
+      return Op;
+
+    SDValue NewCttzElts =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, Op.getOperand(0),
+                    Op.getOperand(1), Op.getOperand(2));
+
+    return DAG.getZExtOrTrunc(NewCttzElts, dl, Ty);
+  }
   }
 }
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1964,6 +1964,11 @@
   defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
   defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
   defm CNTP_XPP  : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
+
+  def : Pat<(i64 (int_experimental_cttz_elts nxv16i1:$Op1, (i32 0))),
+            (i64 (!cast<Instruction>(CNTP_XPP_B)
+                     (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)),
+                     (nxv16i1 (!cast<Instruction>(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>;
 }
 
 defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>;
Index: llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll
@@ -0,0 +1,268 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; FIXED WIDTH
+
+define i8 @ctz_v8i1(<8 x i1> %a) {
+; CHECK-LABEL: .LCPI0_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+  %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i32 0)
+  ret i8 %res
+}
+
+define i32 @ctz_v16i1(<16 x i1> %a) {
+; CHECK-LABEL: .LCPI1_0:
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .byte 14
+; CHECK-NEXT: .byte 13
+; CHECK-NEXT: .byte 12
+; CHECK-NEXT: .byte 11
+; CHECK-NEXT: .byte 10
+; CHECK-NEXT: .byte 9
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umaxv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i32 0)
+  ret i32 %res
+}
+
+define i16 @ctz_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: .LCPI2_0:
+; CHECK-NEXT: .hword 4
+; CHECK-NEXT: .hword 3
+; CHECK-NEXT: .hword 2
+; CHECK-NEXT: .hword 1
+; CHECK-LABEL: ctz_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: mov w9, #4 // =0x4
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+  %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i32 0)
+  ret i16 %res
+}
+
+; ZERO IS POISON
+
+define i8 @ctz_v8i1_poison(<8 x i1> %a) {
+; CHECK-LABEL: .LCPI3_0:
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 7
+; CHECK-NEXT: .byte 6
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 1
+; CHECK-LABEL: ctz_v8i1_poison:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: mov w9, #8 // =0x8
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umaxv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w9, w8, #0xff
+; CHECK-NEXT: cmp w9, #8
+; CHECK-NEXT: csel w0, w8, w8, eq
+; CHECK-NEXT: ret
+  %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i32 1)
+  ret i8 %res
+}
+
+; SCALABLE, WITH VSCALE RANGE
+
+define i64 @ctz_nxv8i1(<vscale x 8 x i1> %a) #0 {
+; CHECK-LABEL: ctz_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.h, #0, #-1
+; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: inch z0.h
+; CHECK-NEXT: and z0.d, z0.d, z1.d
+; CHECK-NEXT: and z0.h, z0.h, #0xff
+; CHECK-NEXT: umaxv h0, p0, z0.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and x0, x8, #0xff
+; CHECK-NEXT: ret
+  %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> %a, i32 0)
+  ret i64 %res
+}
+
+define i32 @ctz_nxv32i1(<vscale x 32 x i1> %a) #0 {
+; CHECK-LABEL: ctz_nxv32i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.h, #0, #-1
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: punpklo p3.h, p1.b
+; CHECK-NEXT: rdvl x9, #2
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: rdvl x8, #-1
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: mov z2.h, w8
+; CHECK-NEXT: inch z0.h, all, mul #4
+; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: add z1.h, z0.h, z1.h
+; CHECK-NEXT: add z4.h, z0.h, z2.h
+; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: and z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.h, z1.h, z2.h
+; CHECK-NEXT: and z3.d, z4.d, z5.d
+; CHECK-NEXT: and z1.d, z1.d, z6.d
+; CHECK-NEXT: and z2.d, z2.d, z7.d
+; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h
+; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h
+; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h
+; CHECK-NEXT: umaxv h0, p2, z0.h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xffff
+; CHECK-NEXT: ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1> %a, i32 0)
+  ret i32 %res
+}
+
+define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
+; CHECK-LABEL: ctz_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: index z1.s, #0, #-1
+; CHECK-NEXT: cntw x9
+; CHECK-NEXT: incw z1.s
+; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
+; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: and z0.d, z1.d, z0.d
+; CHECK-NEXT: and z0.s, z0.s, #0xff
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w8, w9, w8
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32> %a, i32 0)
+  ret i32 %res
+}
+
+; SCALABLE, NO VSCALE RANGE
+
+define i32 @ctz_nxv8i1_no_range(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: ctz_nxv8i1_no_range:
+; CHECK: // %bb.0:
+; CHECK-NEXT: index z0.s, #0, #-1
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: cnth x9
+; CHECK-NEXT: mov z1.s, w8
+; CHECK-NEXT: incw z0.s, all, mul #2
+; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add z1.s, z0.s, z1.s
+; CHECK-NEXT: and z0.d, z0.d, z2.d
+; CHECK-NEXT: and z1.d, z1.d, z3.d
+; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: umaxv s0, p0, z0.s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: sub w0, w9, w8
+; CHECK-NEXT: ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1> %a, i32 0)
+  ret i32 %res
+}
+
+; MATCH WITH BRKB + CNTP
+
+define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
+; CHECK-LABEL: ctz_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i32 0)
+  ret i32 %res
+}
+
+define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: ctz_and_nxv16i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b
+; CHECK-NEXT: brkb p0.b, p1/z, p0.b
+; CHECK-NEXT: cntp x0, p0, p0.b
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
+  %cmp = icmp ne <vscale x 16 x i8> %a, %b
+  %select = select <vscale x 16 x i1> %pg, <vscale x 16 x i1> %cmp, <vscale x 16 x i1> zeroinitializer
+  %and = and <vscale x 16 x i1> %pg, %select
+  %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %and, i32 0)
+  ret i32 %res
+}
+
+declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1>, i32)
+declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i32)
+declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i32)
+
+declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(<vscale x 8 x i1>, i32)
+declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1>, i32)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i32)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv32i1(<vscale x 32 x i1>, i32)
+declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32>, i32)
+
+attributes #0 = { vscale_range(1,16) }
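
Usage sketch (not part of the patch): the IR below shows how a caller might use the new intrinsic to find the index of the first active lane of a predicate. The function name @first_true_idx and the <vscale x 16 x i8> element type are illustrative only; passing i32 0 for the flag selects the non-poison behaviour, so an all-false predicate returns the element count instead of poison.

define i64 @first_true_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  ; Build a predicate from an element-wise compare, then count its trailing
  ; inactive lanes, i.e. the index of the first true element (counting from lane 0).
  %p = icmp eq <vscale x 16 x i8> %a, %b
  %idx = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %p, i32 0)
  ret i64 %idx
}

declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1>, i32)

With SVE enabled, an nxv16i1 operand like this takes the AArch64 path added above (BRKB followed by CNTP); other vector types fall back to the generic SelectionDAGBuilder expansion.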