Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -18337,6 +18337,46 @@
 Both arguments must be vectors of the same type whereby their logical
 concatenation matches the result type.
 
+'``llvm.experimental.cttz.elts``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.experimental.cttz.elts``
+on any vector of integer elements, both fixed width and scalable. Not all
+targets support all bit widths or vector types, however.
+
+::
+
+      declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> <src>, i1 <is_zero_poison>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the number of trailing
+zero elements of a vector.
+
+Arguments:
+""""""""""
+
+The first argument is the vector to be counted. This argument must be a vector
+with integer element type. The return type must also be an integer type that is
+wide enough to hold the maximum possible number of elements of the source
+vector. The behaviour of this intrinsic is undefined if the return type is not
+wide enough for the number of elements in the input vector.
+
+The second argument is a constant flag that indicates whether the intrinsic
+returns a valid result if the first argument is all zero. If the first argument
+is all zero and the second argument is true, the result is poison.
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.cttz.elts``' intrinsic counts the trailing (least
+significant) zero elements in a vector. If ``src`` is all zero then the result
+is the number of elements in the input vector.
+
 '``llvm.experimental.vector.splice``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -465,6 +465,10 @@
     return true;
   }
 
+  /// Return true if the @llvm.experimental.cttz.elts intrinsic should be
+  /// expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandCttzElements(EVT VT) const { return true; }
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -2167,6 +2167,11 @@
                         [IntrNoMem, IntrNoSync, IntrWillReturn,
                          ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 
+def int_experimental_cttz_elts:
+  DefaultAttrsIntrinsic<[llvm_anyint_ty],
+                        [llvm_anyvector_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>,
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7496,6 +7496,62 @@
     setValue(&I, Trunc);
     return;
   }
+  case Intrinsic::experimental_cttz_elts: {
+    auto DL = getCurSDLoc();
+    SDValue Op = getValue(I.getOperand(0));
+    EVT OpVT = Op.getValueType();
+
+    if (!TLI.shouldExpandCttzElements(OpVT)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    if (OpVT.getScalarType() != MVT::i1) {
+      // Compare the input vector elements to zero & use that to count trailing zeros
+      SDValue AllZero = DAG.getConstant(0, DL, OpVT);
+      OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+                              OpVT.getVectorElementCount());
+      Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE);
+    }
+
+    // Find the smallest "sensible" element type to use for the expansion.
+    ConstantRange CR(
+        APInt(64, OpVT.getVectorElementCount().getKnownMinValue()));
+    if (OpVT.isScalableVT())
+      CR = CR.umul_sat(getVScaleRange(I.getCaller(), 64));
+
+    // If the zero-is-poison flag is set, we can assume the upper limit
+    // of the result is VF-1.
+    if (cast<ConstantSDNode>(getValue(I.getOperand(1)))->getZExtValue() != 0)
+      CR = CR.subtract(APInt(64, 1));
+
+    unsigned EltWidth = I.getType()->getScalarSizeInBits();
+    EltWidth = std::min(EltWidth, (unsigned)CR.getActiveBits());
+    EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8);
+
+    MVT NewEltTy = MVT::getIntegerVT(EltWidth);
+
+    // Create the new vector type & get the vector length
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy,
+                                 OpVT.getVectorElementCount());
+
+    SDValue VL =
+        DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount());
+
+    SDValue StepVec = DAG.getStepVector(DL, NewVT);
+    SDValue SplatVL = DAG.getSplat(NewVT, DL, VL);
+    SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec);
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op);
+    SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext);
+    SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
+    SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
+
+    EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, DL, RetTy, Sub);
+
+    setValue(&I, Ret);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -335,6 +335,8 @@
   PTEST_ANY,
   PTRUE,
 
+  CTTZ_ELTS,
+
   BITREVERSE_MERGE_PASSTHRU,
   BSWAP_MERGE_PASSTHRU,
   REVH_MERGE_PASSTHRU,
@@ -927,6 +929,8 @@
 
   bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
 
+  bool shouldExpandCttzElements(EVT VT) const override;
+
   /// If a change in streaming mode is required on entry to/return from a
   /// function call it emits and returns the corresponding SMSTART or SMSTOP node.
/// \p Entry tells whether this is before/after the Call, which is necessary Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1789,6 +1789,13 @@ return false; } +bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { + if (!Subtarget->hasSVE() || VT != MVT::nxv16i1) + return true; + + return false; +} + void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT, bool StreamingSVE) { assert(VT.isFixedLengthVector() && "Expected fixed length vector type!"); @@ -2632,6 +2639,7 @@ MAKE_CASE(AArch64ISD::MRRS) MAKE_CASE(AArch64ISD::MSRR) MAKE_CASE(AArch64ISD::RSHRNB_I) + MAKE_CASE(AArch64ISD::CTTZ_ELTS) } #undef MAKE_CASE return nullptr; @@ -5336,6 +5344,16 @@ } return SDValue(); } + case Intrinsic::experimental_cttz_elts: { + EVT Ty = Op.getValueType(); + SDValue NewCttzElts = + DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, Op.getOperand(1)); + + if (Ty == MVT::i64) + return NewCttzElts; + + return DAG.getZExtOrTrunc(NewCttzElts, dl, Ty); + } } } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -834,6 +834,9 @@ [(AArch64rshrnb node:$rs, node:$i), (int_aarch64_sve_rshrnb node:$rs, node:$i)]>; +def AArch64CttzElts : SDNode<"AArch64ISD::CTTZ_ELTS", SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisVec<1>]>, []>; + // Match add node and also treat an 'or' node is as an 'add' if the or'ed operands // have no common bits. def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs), Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1964,6 +1964,11 @@ defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>; defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>; defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>; + + def : Pat<(i64 (AArch64CttzElts nxv16i1:$Op1)), + (i64 (!cast(CNTP_XPP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1)), + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op1))))>; } defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb", add, int_aarch64_sve_cntb>; @@ -2049,6 +2054,17 @@ defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; + def : Pat<(i64 (add GPR64:$Op1, (i64 (AArch64CttzElts nxv16i1:$Op2)))), + (i64 (!cast(INCP_XP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)), + GPR64:$Op1))>; + + def : Pat<(i32 (add GPR32:$Op1, (trunc (i64 (AArch64CttzElts nxv16i1:$Op2))))), + (i32 (EXTRACT_SUBREG (i64 (!cast(INCP_XP_B) + (nxv16i1 (!cast(BRKB_PPzP) (PTRUE_B 31), nxv16i1:$Op2)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Op1, sub_32))), + sub_32))>; + defm INDEX_RR : sve_int_index_rr<"index", AArch64mul_p_oneuse>; defm INDEX_IR : sve_int_index_ir<"index", AArch64mul_p, AArch64mul_p_oneuse>; defm INDEX_RI : sve_int_index_ri<"index">; Index: llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -0,0 +1,264 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; WITH VSCALE 
RANGE + +define i64 @ctz_nxv8i1( %a) #0 { +; CHECK-LABEL: ctz_nxv8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: inch z0.h +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( %a, i1 0) + ret i64 %res +} + +define i32 @ctz_nxv32i1( %a) #0 { +; CHECK-LABEL: ctz_nxv32i1: +; CHECK: // %bb.0: +; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: punpklo p3.h, p1.b +; CHECK-NEXT: rdvl x9, #2 +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: rdvl x8, #-1 +; CHECK-NEXT: punpkhi p1.h, p1.b +; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: inch z0.h, all, mul #4 +; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z4.h, z0.h, z2.h +; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: add z2.h, z1.h, z2.h +; CHECK-NEXT: and z3.d, z4.d, z5.d +; CHECK-NEXT: and z1.d, z1.d, z6.d +; CHECK-NEXT: and z2.d, z2.d, z7.d +; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h +; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h +; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p2, z0.h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( %a, i1 0) + ret i32 %res +} + +define i32 @ctz_nxv4i32( %a) #0 { +; CHECK-LABEL: ctz_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: index z1.s, #0, #-1 +; CHECK-NEXT: cntw x9 +; CHECK-NEXT: incw z1.s +; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: umaxv s0, p0, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32( %a, i1 0) + ret i32 %res +} + +; VSCALE RANGE, ZERO IS POISON + +define i64 @vscale_4096( %a) #1 { +; CHECK-LABEL: vscale_4096: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: neg x8, x9 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: incw z0.s, all, mul #4 +; CHECK-NEXT: add z1.s, z0.s, z1.s +; CHECK-NEXT: add z5.s, z0.s, z2.s +; CHECK-NEXT: punpkhi p2.h, p1.b +; CHECK-NEXT: punpkhi p3.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: add z2.s, z1.s, z2.s +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p2.s +; CHECK-NEXT: mov z4.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: and 
z2.d, z2.d, z4.d +; CHECK-NEXT: and z3.d, z5.d, z6.d +; CHECK-NEXT: and z0.d, z0.d, z7.d +; CHECK-NEXT: umax z1.s, p2/m, z1.s, z2.s +; CHECK-NEXT: umax z0.s, p2/m, z0.s, z3.s +; CHECK-NEXT: umax z0.s, p2/m, z0.s, z1.s +; CHECK-NEXT: umaxv s0, p2, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8( %a, i1 0) + ret i64 %res +} + +define i64 @vscale_4096_poison( %a) #1 { +; CHECK-LABEL: vscale_4096_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: rdvl x9, #1 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: inch z0.h, all, mul #2 +; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8( %a, i1 1) + ret i64 %res +} + +; NO VSCALE RANGE + +define i32 @ctz_nxv8i1_no_range( %a) { +; CHECK-LABEL: ctz_nxv8i1_no_range: +; CHECK: // %bb.0: +; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: cnth x9 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: incw z0.s, all, mul #2 +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: add z1.s, z0.s, z1.s +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: umaxv s0, p0, z0.s +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv8i1( %a, i1 0) + ret i32 %res +} + +; MATCH WITH BRKB + CNTP + +define i32 @ctz_nxv16i1( %pg, %a) { +; CHECK-LABEL: ctz_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 0) + ret i32 %res +} + +define i32 @ctz_nxv16i1_poison( %pg, %a) { +; CHECK-LABEL: ctz_nxv16i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 1) + ret i32 %res +} + +define i32 @ctz_and_nxv16i1( %pg, %a, %b) { +; CHECK-LABEL: ctz_and_nxv16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: brkb p0.b, p1/z, p0.b +; CHECK-NEXT: cntp x0, p0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %cmp = icmp ne %a, %b + %select = select %pg, %cmp, zeroinitializer + %and = and %pg, %select + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %and, i1 0) + ret i32 %res +} + +define i64 @add_i64_ctz_nxv16i1_poison( %pg, %a, i64 %b) { +; CHECK-LABEL: add_i64_ctz_nxv16i1_poison: +; CHECK: 
// %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: incp x0, p0.b +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( %a, i1 1) + %add = add i64 %res, %b + ret i64 %add +} + +define i32 @add_i32_ctz_nxv16i1_poison( %pg, %a, i32 %b) { +; CHECK-LABEL: add_i32_ctz_nxv16i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: incp x0, p0.b +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( %a, i1 1) + %trunc = trunc i64 %res to i32 + %add = add i32 %trunc, %b + ret i32 %add +} + +declare i32 @llvm.experimental.cttz.elts.i32.nxv8i1(, i1) +declare i64 @llvm.experimental.cttz.elts.i64.nxv8i1(, i1) +declare i64 @llvm.experimental.cttz.elts.i64.nxv16i1(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.nxv32i1(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(, i1) + +declare i64 @llvm.experimental.cttz.elts.i64.nxv16i8(, i1) + +attributes #0 = { vscale_range(1,16) } +attributes #1 = { vscale_range(1,4096) } Index: llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll @@ -0,0 +1,119 @@ +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s + +; FIXED WIDTH + +define i8 @ctz_v8i1(<8 x i1> %a) { +; CHECK-LABEL: .LCPI0_0: +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v8i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umaxv b0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i1 0) + ret i8 %res +} + +define i32 @ctz_v16i1(<16 x i1> %a) { +; CHECK-LABEL: .LCPI1_0: +; CHECK-NEXT: .byte 16 +; CHECK-NEXT: .byte 15 +; CHECK-NEXT: .byte 14 +; CHECK-NEXT: .byte 13 +; CHECK-NEXT: .byte 12 +; CHECK-NEXT: .byte 11 +; CHECK-NEXT: .byte 10 +; CHECK-NEXT: .byte 9 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v16i1: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: mov w9, #16 // =0x10 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umaxv b0, v0.16b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0) + ret i32 %res +} + +define i16 @ctz_v4i32(<4 x i32> %a) { +; CHECK-LABEL: .LCPI2_0: +; CHECK-NEXT: .hword 4 +; CHECK-NEXT: .hword 3 +; CHECK-NEXT: .hword 2 +; CHECK-NEXT: .hword 1 +; CHECK-LABEL: ctz_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0] +; 
CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umaxv h0, v0.4h +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w8, w9, w8 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) + ret i16 %res +} + +; ZERO IS POISON + +define i8 @ctz_v8i1_poison(<8 x i1> %a) { +; CHECK-LABEL: .LCPI3_0: +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v8i1_poison: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.8b, v0.8b, #7 +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umaxv b0, v0.8b +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %res = call i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1> %a, i1 1) + ret i8 %res +} + +declare i8 @llvm.experimental.cttz.elts.i8.v8i1(<8 x i1>, i1) +declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1) +declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1) Index: llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -0,0 +1,175 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=riscv32 -mattr=+v < %s | FileCheck %s -check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s -check-prefix=RV64 + +; WITH VSCALE RANGE + +define i32 @ctz_nxv4i32( %a) #0 { +; RV32-LABEL: ctz_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: vid.v v11 +; RV32-NEXT: li a1, -1 +; RV32-NEXT: vmadd.vx v11, a1, v10 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmsne.vi v0, v8, 0 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v8, v8, -1, v0 +; RV32-NEXT: vand.vv v8, v11, v8 +; RV32-NEXT: vredmaxu.vs v8, v8, v8 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: lui a1, 16 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vid.v v11 +; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmadd.vx v11, a1, v10 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vand.vv v8, v11, v8 +; RV64-NEXT: vredmaxu.vs v8, v8, v8 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: lui a1, 16 +; RV64-NEXT: addiw a1, a1, -1 +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32( %a, i1 0) + ret i32 %res +} + +; NO VSCALE RANGE + +define i64 @ctz_nxv8i1_no_range( %a) { +; RV32-LABEL: ctz_nxv8i1_no_range: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -48 +; RV32-NEXT: .cfi_def_cfa_offset 48 +; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; 
RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; RV32-NEXT: addi a0, sp, 32 +; RV32-NEXT: vs2r.v v8, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: li a2, 8 +; RV32-NEXT: li a1, 0 +; RV32-NEXT: li a3, 0 +; RV32-NEXT: call __muldi3@plt +; RV32-NEXT: sw a1, 24(sp) +; RV32-NEXT: sw a0, 20(sp) +; RV32-NEXT: addi a2, sp, 20 +; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a2), zero +; RV32-NEXT: vid.v v8 +; RV32-NEXT: li a2, -1 +; RV32-NEXT: vmadd.vx v8, a2, v16 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: addi a2, sp, 32 +; RV32-NEXT: vl2r.v v16, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vmsne.vi v0, v16, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vmerge.vim v16, v16, -1, v0 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vredmaxu.vs v8, v8, v8 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: sltu a3, a0, a2 +; RV32-NEXT: li a4, 32 +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32-NEXT: vsrl.vx v8, v8, a4 +; RV32-NEXT: vmv.x.s a4, v8 +; RV32-NEXT: sub a1, a1, a4 +; RV32-NEXT: sub a1, a1, a3 +; RV32-NEXT: sub a0, a0, a2 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: add sp, sp, a2 +; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 48 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_nxv8i1_no_range: +; RV64: # %bb.0: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.x v24, a0 +; RV64-NEXT: vid.v v16 +; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmadd.vx v16, a1, v24 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmsne.vi v0, v8, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: vredmaxu.vs v8, v8, v8 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: ret + %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i16( %a, i1 0) + ret i64 %res +} + +define i32 @ctz_nxv16i1( %pg, %a) { +; RV32-LABEL: ctz_nxv16i1: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vid.v v16 +; RV32-NEXT: li a1, -1 +; RV32-NEXT: vmadd.vx v16, a1, v8 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: vredmaxu.vs v8, v8, v8 +; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_nxv16i1: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vid.v v16 +; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmadd.vx v16, a1, v8 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v8, v8, -1, v0 +; RV64-NEXT: vand.vv v8, v16, v8 +; RV64-NEXT: vredmaxu.vs v8, v8, v8 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 0) + ret i32 %res +} + +declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(, i1) + +attributes #0 = { vscale_range(2,1024) } Index: 
llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -check-prefix=RV32 +; RUN: llc -mtriple=riscv64 < %s | FileCheck %s -check-prefix=RV64 + +; FIXED WIDTH + +define i16 @ctz_v4i32(<4 x i32> %a) { +; RV32-LABEL: ctz_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: lw a3, 0(a0) +; RV32-NEXT: lw a1, 4(a0) +; RV32-NEXT: lw a2, 12(a0) +; RV32-NEXT: lw a4, 8(a0) +; RV32-NEXT: seqz a0, a3 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: andi a0, a0, 4 +; RV32-NEXT: seqz a3, a4 +; RV32-NEXT: addi a3, a3, -1 +; RV32-NEXT: andi a3, a3, 2 +; RV32-NEXT: bltu a3, a0, .LBB0_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: .LBB0_2: +; RV32-NEXT: snez a2, a2 +; RV32-NEXT: seqz a1, a1 +; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: andi a1, a1, 3 +; RV32-NEXT: bltu a2, a1, .LBB0_4 +; RV32-NEXT: # %bb.3: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: .LBB0_4: +; RV32-NEXT: bltu a1, a0, .LBB0_6 +; RV32-NEXT: # %bb.5: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB0_6: +; RV32-NEXT: li a1, 4 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: andi a0, a1, 255 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: lw a3, 0(a0) +; RV64-NEXT: lw a1, 8(a0) +; RV64-NEXT: lw a2, 24(a0) +; RV64-NEXT: lw a4, 16(a0) +; RV64-NEXT: seqz a0, a3 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: andi a0, a0, 4 +; RV64-NEXT: seqz a3, a4 +; RV64-NEXT: addi a3, a3, -1 +; RV64-NEXT: andi a3, a3, 2 +; RV64-NEXT: bltu a3, a0, .LBB0_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a3 +; RV64-NEXT: .LBB0_2: +; RV64-NEXT: snez a2, a2 +; RV64-NEXT: seqz a1, a1 +; RV64-NEXT: addi a1, a1, -1 +; RV64-NEXT: andi a1, a1, 3 +; RV64-NEXT: bltu a2, a1, .LBB0_4 +; RV64-NEXT: # %bb.3: +; RV64-NEXT: mv a1, a2 +; RV64-NEXT: .LBB0_4: +; RV64-NEXT: bltu a1, a0, .LBB0_6 +; RV64-NEXT: # %bb.5: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB0_6: +; RV64-NEXT: li a1, 4 +; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: andi a0, a1, 255 +; RV64-NEXT: ret + %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) + ret i16 %res +} + +; ZERO IS POISON + +define i32 @ctz_v2i1_poison(<2 x i1> %a) { +; RV32-LABEL: ctz_v2i1_poison: +; RV32: # %bb.0: +; RV32-NEXT: andi a1, a1, 1 +; RV32-NEXT: slli a0, a0, 31 +; RV32-NEXT: srai a0, a0, 31 +; RV32-NEXT: andi a0, a0, 2 +; RV32-NEXT: bltu a1, a0, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: li a1, 2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: andi a0, a1, 255 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v2i1_poison: +; RV64: # %bb.0: +; RV64-NEXT: andi a1, a1, 1 +; RV64-NEXT: slli a0, a0, 63 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: andi a0, a0, 2 +; RV64-NEXT: bltu a1, a0, .LBB1_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB1_2: +; RV64-NEXT: li a1, 2 +; RV64-NEXT: subw a1, a1, a0 +; RV64-NEXT: andi a0, a1, 255 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1) + ret i32 %res +} + +declare i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1>, i1) +declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1) Index: llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll @@ -0,0 +1,134 @@ +; RUN: llc 
-mtriple=x86_64-unknown-unknown < %s | FileCheck %s + +define i8 @ctz_v8i16(<8 x i16> %a) { +; CHECK-LABEL: .LCPI0_0: +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqw %xmm0, %xmm1 +; CHECK-NEXT: packsswb %xmm1, %xmm1 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: cmpb %cl, %al +; CHECK-NEXT: cmoval %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: cmpb %dl, %cl +; CHECK-NEXT: cmovbel %edx, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movb $8, %al +; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: retq + %res = call i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16> %a, i1 0) + ret i8 %res +} + +define i16 @ctz_v4i32(<4 x i32> %a) { +; CHECK-LABEL: .LCPI1_0: +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-NEXT: packssdw %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm0 +; CHECK-NEXT: packsswb %xmm0, %xmm0 +; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: shrl $8, %ecx +; CHECK-NEXT: cmpb %cl, %al +; CHECK-NEXT: cmoval %eax, %ecx +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shrl $16, %edx +; CHECK-NEXT: cmpb %dl, %cl +; CHECK-NEXT: cmoval %ecx, %edx +; CHECK-NEXT: shrl $24, %eax +; CHECK-NEXT: cmpb %al, %dl +; CHECK-NEXT: cmoval %edx, %eax +; CHECK-NEXT: movb $4, %cl +; CHECK-NEXT: subb %al, %cl +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq + %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) + ret i16 %res +} + +; ZERO IS POISON + +define i8 @ctz_v8i16_poison(<8 x i16> %a) { +; CHECK-LABEL: .LCPI2_0: +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .byte 7 +; CHECK-NEXT: .byte 6 +; CHECK-NEXT: .byte 5 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 1 +; CHECK-LABEL: ctz_v8i16_poison: +; CHECK: # %bb.0: +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pcmpeqw %xmm0, %xmm1 +; CHECK-NEXT: packsswb %xmm1, %xmm1 +; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: cmpb %cl, %al +; CHECK-NEXT: cmoval %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, 
%cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: cmpb %dl, %cl +; CHECK-NEXT: cmovbel %edx, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: cmpb %al, %cl +; CHECK-NEXT: cmovbel %eax, %ecx +; CHECK-NEXT: movb $8, %al +; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: retq + %res = call i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16> %a, i1 1) + ret i8 %res +} + +declare i8 @llvm.experimental.cttz.elts.i8.v8i16(<8 x i16>, i1) +declare i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32>, i1)
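
A brief usage sketch, not part of the patch above: based on the LangRef wording, the intrinsic can report the index of the first set lane of a predicate vector. The function name below is hypothetical; the mangled intrinsic name follows the declarations already used in the tests.

; Hypothetical example: index of the first active lane of an SVE predicate.
; The zero-is-poison flag is set, so the caller must guarantee that at least
; one lane of %pg is set; with the flag clear, an all-false %pg would instead
; return the number of lanes.
define i32 @first_active_lane(<vscale x 16 x i1> %pg) {
  %idx = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %pg, i1 true)
  ret i32 %idx
}

declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)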