diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7083,8 +7083,8 @@
     SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
     SDValue Zero = DAG.getConstant(0, dl, VT);
     SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-    return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                       DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
+    return DAG.getSelect(dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ);
   }
 
   // Only expand vector types if we have the appropriate vector bit operations.
@@ -7132,8 +7132,8 @@
     SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op);
     SDValue Zero = DAG.getConstant(0, dl, VT);
     SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
-    return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
-                       DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
+    return DAG.getSelect(dl, VT, SrcIsZero,
+                         DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ);
   }
 
   // Only expand vector types if we have the appropriate vector bit operations.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -630,6 +630,18 @@
         setLoadExtAction(ISD::SEXTLOAD, OtherVT, VT, Expand);
         setLoadExtAction(ISD::ZEXTLOAD, OtherVT, VT, Expand);
       }
+
+      // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+      // type that can represent the value exactly.
+      if (VT.getVectorElementType() != MVT::i64) {
+        MVT FloatEltVT =
+            VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+        EVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+        if (isTypeLegal(FloatVT)) {
+          setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+          setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+        }
+      }
     }
 
     // Expand various CCs to best match the RVV ISA, which natively supports UNE
@@ -848,6 +860,19 @@
       for (unsigned VPOpc : IntegerVPOps)
         setOperationAction(VPOpc, VT, Custom);
+
+      // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point
+      // type that can represent the value exactly.
+      if (VT.getVectorElementType() != MVT::i64) {
+        MVT FloatEltVT =
+            VT.getVectorElementType() == MVT::i32 ? MVT::f64 : MVT::f32;
+        EVT FloatVT =
+            MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+        if (isTypeLegal(FloatVT)) {
+          setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
+          setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+        }
+      }
     }
 
   for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) {
@@ -2323,6 +2348,57 @@
   return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL);
 }
 
+// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting
+// the exponent.
+static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+  unsigned EltSize = VT.getScalarSizeInBits();
+  SDValue Src = Op.getOperand(0);
+  SDLoc DL(Op);
+
+  // We need a FP type that can represent the value.
+  // TODO: Use f16 for i8 when possible?
+  MVT FloatEltVT = EltSize == 32 ? MVT::f64 : MVT::f32;
+  MVT FloatVT = MVT::getVectorVT(FloatEltVT, VT.getVectorElementCount());
+
+  // Legal types should have been checked in the RISCVTargetLowering
+  // constructor.
+  // TODO: Splitting may make sense in some cases.
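+  // A rough sketch of the arithmetic behind this lowering: for a nonzero i32
+  // element x, uint_to_fp(x) as f64 carries a biased exponent of
+  // 1023 + floor(log2(x)), so ctlz(x) == (1023 + 31) - exponent (the 1054
+  // used in the i32 tests below). For cttz, isolating the lowest set bit with
+  // x & -x first makes the exponent 1023 + cttz(x), so cttz(x) ==
+  // exponent - 1023. The f32 path works the same way with bias 127 (hence the
+  // 134 for i8 and 142 for i16 in the CHECK lines).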
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(FloatVT) && + "Expected legal float type!"); + + // For CTTZ_ZERO_UNDEF, we need to extract the lowest set bit using X & -X. + // The trailing zero count is equal to log2 of this single bit value. + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { + SDValue Neg = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg); + } + + // We have a legal FP type, convert to it. + SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src); + // Bitcast to integer and shift the exponent to the LSB. + EVT IntVT = FloatVT.changeVectorElementTypeToInteger(); + SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal); + unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23; + SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast, + DAG.getConstant(ShiftAmt, DL, IntVT)); + // Truncate back to original type to allow vnsrl. + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift); + // The exponent contains log2 of the value in biased form. + unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127; + + // For trailing zeros, we just need to subtract the bias. + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) + return DAG.getNode(ISD::SUB, DL, VT, Trunc, + DAG.getConstant(ExponentBias, DL, VT)); + + // For leading zeros, we need to remove the bias and convert from log2 to + // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)). + unsigned Adjust = ExponentBias + (EltSize - 1); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc); +} + // While RVV has alignment restrictions, we should always be able to load as a // legal equivalently-sized byte-typed vector instead. This method is // responsible for re-expressing a ISD::LOAD via a correctly-aligned type. 
If @@ -2941,6 +3017,9 @@ return lowerToScalableOp(Op, DAG, RISCVISD::FMAXNUM_VL); case ISD::ABS: return lowerABS(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: + return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG); case ISD::VSELECT: return lowerFixedLengthVectorSelectToRVV(Op, DAG); case ISD::FCOPYSIGN: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctlz-sdnode.ll @@ -1,147 +1,429 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32D +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64D define @ctlz_nxv1i8( %va) { -; CHECK-LABEL: ctlz_nxv1i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: ctlz_nxv1i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv1i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: 
vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv1i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v9, v9 +; RV32D-NEXT: vsrl.vi v9, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: vrsub.vx v8, v9, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv1i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v9, v9 +; RV64D-NEXT: vsrl.vi v9, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: vrsub.vx v8, v9, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv1i8( %va, i1 false) ret %a } declare @llvm.ctlz.nxv1i8(, i1) define @ctlz_nxv2i8( %va) { -; CHECK-LABEL: ctlz_nxv2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: ctlz_nxv2i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv2i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; 
RV32D-LABEL: ctlz_nxv2i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v9, v9 +; RV32D-NEXT: vsrl.vi v9, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: vrsub.vx v8, v9, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv2i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v9, v9 +; RV64D-NEXT: vsrl.vi v9, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: vrsub.vx v8, v9, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv2i8( %va, i1 false) ret %a } declare @llvm.ctlz.nxv2i8(, i1) define @ctlz_nxv4i8( %va) { -; CHECK-LABEL: ctlz_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: ctlz_nxv4i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv4i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv4i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: 
vzext.vf4 v10, v8 +; RV32D-NEXT: vfcvt.f.xu.v v10, v10 +; RV32D-NEXT: vsrl.vi v10, v10, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v10, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: vrsub.vx v8, v9, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv4i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vzext.vf4 v10, v8 +; RV64D-NEXT: vfcvt.f.xu.v v10, v10 +; RV64D-NEXT: vsrl.vi v10, v10, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v10, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: vrsub.vx v8, v9, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv4i8( %va, i1 false) ret %a } declare @llvm.ctlz.nxv4i8(, i1) define @ctlz_nxv8i8( %va) { -; CHECK-LABEL: ctlz_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: ctlz_nxv8i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv8i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv8i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vzext.vf4 v12, v8 +; RV32D-NEXT: vfcvt.f.xu.v v12, v12 +; RV32D-NEXT: vsrl.vi v12, v12, 23 +; RV32D-NEXT: 
vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v12, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v10, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: vrsub.vx v8, v9, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv8i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: vzext.vf4 v12, v8 +; RV64D-NEXT: vfcvt.f.xu.v v12, v12 +; RV64D-NEXT: vsrl.vi v12, v12, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v12, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v10, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: vrsub.vx v8, v9, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv8i8( %va, i1 false) ret %a } declare @llvm.ctlz.nxv8i8(, i1) define @ctlz_nxv16i8( %va) { -; CHECK-LABEL: ctlz_nxv16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: ctlz_nxv16i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv16i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv16i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV32D-NEXT: vzext.vf4 v16, v8 +; RV32D-NEXT: vfcvt.f.xu.v v16, v16 +; RV32D-NEXT: vsrl.vi v16, v16, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi 
v12, v16, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v12, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: vrsub.vx v8, v10, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv16i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV64D-NEXT: vzext.vf4 v16, v8 +; RV64D-NEXT: vfcvt.f.xu.v v16, v16 +; RV64D-NEXT: vsrl.vi v16, v16, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v16, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v12, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: vrsub.vx v8, v10, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv16i8( %va, i1 false) ret %a } @@ -206,31 +488,536 @@ declare @llvm.ctlz.nxv64i8(, i1) define @ctlz_nxv1i16( %va) { -; RV32-LABEL: ctlz_nxv1i16: +; RV32I-LABEL: ctlz_nxv1i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv1i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv1i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vsrl.vi v9, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; 
RV64D-LABEL: ctlz_nxv1i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vsrl.vi v9, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v9, v9, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv1i16( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv1i16(, i1) + +define @ctlz_nxv2i16( %va) { +; RV32I-LABEL: ctlz_nxv2i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv2i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv2i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vsrl.vi v9, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv2i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vsrl.vi v9, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: 
vrsub.vx v9, v9, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv2i16( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv2i16(, i1) + +define @ctlz_nxv4i16( %va) { +; RV32I-LABEL: ctlz_nxv4i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv4i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv4i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vsrl.vi v10, v10, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v10, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv4i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vsrl.vi v10, v10, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v10, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v9, v9, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv4i16( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv4i16(, i1) + +define @ctlz_nxv8i16( %va) { +; RV32I-LABEL: ctlz_nxv8i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, 
e16, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv8i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv8i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vsrl.vi v12, v12, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v12, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v10, v10, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv8i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vsrl.vi v12, v12, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v12, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v10, v10, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v10, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv8i16( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv8i16(, i1) + +define @ctlz_nxv16i16( %va) { +; RV32I-LABEL: ctlz_nxv16i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vxor.vi v8, 
v8, -1 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv16i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv16i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vsrl.vi v16, v16, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v16, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v12, v12, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv16i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: vsrl.vi v16, v16, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v16, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v12, v12, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v12, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv16i16( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv16i16(, i1) + +define @ctlz_nxv32i16( %va) { +; RV32-LABEL: ctlz_nxv32i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vxor.vi v8, v8, -1 -; 
RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v8, v8, a0 @@ -239,31 +1026,31 @@ ; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv1i16: +; RV64-LABEL: ctlz_nxv32i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 @@ -271,86 +1058,546 @@ ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv1i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv32i16( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv1i16(, i1) +declare @llvm.ctlz.nxv32i16(, i1) -define @ctlz_nxv2i16( %va) { -; RV32-LABEL: ctlz_nxv2i16: +define @ctlz_nxv1i32( %va) { +; RV32I-LABEL: ctlz_nxv1i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: 
addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv1i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv1i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32D-NEXT: vsrl.vx v9, v9, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv1i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64D-NEXT: vsrl.vx v9, v9, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v9, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v9, v9, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv1i32( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv1i32(, i1) + +define @ctlz_nxv2i32( %va) { +; RV32I-LABEL: ctlz_nxv2i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 
4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv2i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv2i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32D-NEXT: vsrl.vx v10, v10, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v10, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv2i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64D-NEXT: vsrl.vx v10, v10, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v9, v10, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v9, v9, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv2i32( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv2i32(, i1) + +define @ctlz_nxv4i32( %va) { +; RV32I-LABEL: ctlz_nxv4i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: 
vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv4i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv4i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32D-NEXT: vsrl.vx v12, v12, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v12, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v10, v10, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv4i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64D-NEXT: vsrl.vx v12, v12, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v12, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v10, v10, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v10, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv4i32( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv4i32(, i1) + +define @ctlz_nxv8i32( %va) { +; RV32I-LABEL: ctlz_nxv8i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; 
RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_nxv8i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_nxv8i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32D-NEXT: vsrl.vx v16, v16, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v16, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v12, v12, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_nxv8i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64D-NEXT: vsrl.vx v16, v16, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v16, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v12, v12, a0 +; RV64D-NEXT: vmseq.vi v0, v8, 0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v12, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv8i32( %va, i1 false) + ret %a +} +declare @llvm.ctlz.nxv8i32(, i1) + +define @ctlz_nxv16i32( %va) { +; RV32-LABEL: ctlz_nxv16i32: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 
-; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 ; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv2i16: +; RV64-LABEL: ctlz_nxv16i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv2i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv16i32( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv2i16(, i1) +declare @llvm.ctlz.nxv16i32(, i1) -define @ctlz_nxv4i16( %va) { -; RV32-LABEL: ctlz_nxv4i16: +define @ctlz_nxv1i64( %va) { +; RV32-LABEL: ctlz_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 209715 +; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; RV32-NEXT: vsrl.vi v9, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsrl.vi v9, v8, 2 @@ -359,31 +1606,39 @@ ; RV32-NEXT: vor.vv v8, v8, v9 ; 
RV32-NEXT: vsrl.vi v9, v8, 8 ; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsrl.vi v9, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: addi a0, zero, 32 +; RV32-NEXT: vsrl.vx v9, v8, a0 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsrl.vi v11, v8, 1 +; RV32-NEXT: vand.vv v9, v11, v9 ; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vv v9, v8, v10 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vand.vv v8, v8, v10 ; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsrl.vi v11, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v11 +; RV32-NEXT: vand.vv v8, v8, v9 +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: addi a0, zero, 56 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv4i16: +; RV64-LABEL: ctlz_nxv1i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu ; RV64-NEXT: vsrl.vi v9, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 2 @@ -392,36 +1647,83 @@ ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vsrl.vi v9, v8, 8 ; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsrl.vi v9, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: addi a0, zero, 32 +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vxor.vi v8, v8, -1 ; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: lui a0, 21845 ; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 ; RV64-NEXT: vand.vx v9, v9, a0 ; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: lui a0, 13107 ; RV64-NEXT: addiw a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 ; RV64-NEXT: vand.vx v9, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: vadd.vv v8, v9, v8 ; RV64-NEXT: vsrl.vi v9, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 +; RV64-NEXT: lui a0, 3855 +; RV64-NEXT: addiw a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: addi a0, zero, 56 +; RV64-NEXT: 
vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv4i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv1i64( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv4i16(, i1) +declare @llvm.ctlz.nxv1i64(, i1) -define @ctlz_nxv8i16( %va) { -; RV32-LABEL: ctlz_nxv8i16: +define @ctlz_nxv2i64( %va) { +; RV32-LABEL: ctlz_nxv2i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 209715 +; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; RV32-NEXT: vsrl.vi v10, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 2 @@ -430,31 +1732,39 @@ ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsrl.vi v10, v8, 8 ; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsrl.vi v10, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: addi a0, zero, 32 +; RV32-NEXT: vsrl.vx v10, v8, a0 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsrl.vi v14, v8, 1 +; RV32-NEXT: vand.vv v10, v14, v10 ; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vand.vv v10, v8, v12 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vand.vv v8, v8, v12 ; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsrl.vi v14, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v14 +; RV32-NEXT: vand.vv v8, v8, v10 +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: addi a0, zero, 56 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv8i16: +; RV64-LABEL: ctlz_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu ; RV64-NEXT: vsrl.vi v10, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 2 @@ -463,36 +1773,83 @@ ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vsrl.vi v10, v8, 8 ; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vi v10, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: addi a0, zero, 32 +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vxor.vi v8, v8, -1 ; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: lui a0, 21845 ; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 ; RV64-NEXT: vand.vx 
v10, v10, a0 ; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: lui a0, 13107 ; RV64-NEXT: addiw a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 ; RV64-NEXT: vand.vx v10, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: vadd.vv v8, v10, v8 ; RV64-NEXT: vsrl.vi v10, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 +; RV64-NEXT: lui a0, 3855 +; RV64-NEXT: addiw a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: addi a0, zero, 56 +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv8i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv2i64( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv8i16(, i1) +declare @llvm.ctlz.nxv2i64(, i1) -define @ctlz_nxv16i16( %va) { -; RV32-LABEL: ctlz_nxv16i16: +define @ctlz_nxv4i64( %va) { +; RV32-LABEL: ctlz_nxv4i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 209715 +; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; RV32-NEXT: vsrl.vi v12, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 2 @@ -501,31 +1858,39 @@ ; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: vsrl.vi v12, v8, 8 ; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsrl.vi v12, v8, 16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: addi a0, zero, 32 +; RV32-NEXT: vsrl.vx v12, v8, a0 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v12, (a0), zero ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsrl.vi v20, v8, 1 +; RV32-NEXT: vand.vv v12, v20, v12 ; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vand.vv v12, v8, v16 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vand.vv v8, v8, v16 ; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v16, 
(a0), zero +; RV32-NEXT: vsrl.vi v20, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v20 +; RV32-NEXT: vand.vv v8, v8, v12 +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: addi a0, zero, 56 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv16i16: +; RV64-LABEL: ctlz_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu ; RV64-NEXT: vsrl.vi v12, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 2 @@ -534,36 +1899,83 @@ ; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsrl.vi v12, v8, 8 ; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vsrl.vi v12, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: addi a0, zero, 32 +; RV64-NEXT: vsrl.vx v12, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vxor.vi v8, v8, -1 ; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: lui a0, 21845 ; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 ; RV64-NEXT: vand.vx v12, v12, a0 ; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: lui a0, 13107 ; RV64-NEXT: addiw a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 ; RV64-NEXT: vand.vx v12, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: vadd.vv v8, v12, v8 ; RV64-NEXT: vsrl.vi v12, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 +; RV64-NEXT: lui a0, 3855 +; RV64-NEXT: addiw a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: addi a0, zero, 56 +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv16i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv4i64( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv16i16(, i1) +declare @llvm.ctlz.nxv4i64(, i1) -define @ctlz_nxv32i16( %va) { -; RV32-LABEL: ctlz_nxv32i16: +define @ctlz_nxv8i64( %va) { +; RV32-LABEL: ctlz_nxv8i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: addi a0, a0, 1365 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 209715 +; RV32-NEXT: addi a0, a0, 819 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 2 @@ -572,31 +1984,39 @@ ; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vsrl.vi v16, v8, 8 ; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, 
v8, 16 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, zero, 32 +; RV32-NEXT: vsrl.vx v16, v8, a0 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v16, (a0), zero ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsrl.vi v0, v8, 1 +; RV32-NEXT: vand.vv v16, v0, v16 ; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vand.vv v16, v8, v24 ; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsrl.vi v0, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v8, v8, v24 +; RV32-NEXT: addi a0, zero, 56 +; RV32-NEXT: vsrl.vx v8, v8, a0 +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_nxv32i16: +; RV64-LABEL: ctlz_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu ; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 2 @@ -605,1837 +2025,1461 @@ ; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vsrl.vi v16, v8, 8 ; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: addi a0, zero, 32 +; RV64-NEXT: vsrl.vx v16, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vxor.vi v8, v8, -1 ; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: lui a0, 21845 ; RV64-NEXT: addiw a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 1365 ; RV64-NEXT: vand.vx v16, v16, a0 ; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: lui a0, 13107 ; RV64-NEXT: addiw a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 819 ; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 ; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vsrl.vi v16, v8, 4 ; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 +; RV64-NEXT: lui a0, 3855 +; RV64-NEXT: addiw a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, 241 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: addi a0, zero, 56 +; RV64-NEXT: vsrl.vx v8, v8, a0 ; RV64-NEXT: ret - %a = call 
@llvm.ctlz.nxv32i16( %va, i1 false) - ret %a + %a = call @llvm.ctlz.nxv8i64( %va, i1 false) + ret %a } -declare @llvm.ctlz.nxv32i16(, i1) +declare @llvm.ctlz.nxv8i64(, i1) -define @ctlz_nxv1i32( %va) { -; RV32-LABEL: ctlz_nxv1i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv1i8( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv1i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv1i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv1i32( %va, i1 false) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv1i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: 
vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv1i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv1i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv1i8( %va, i1 true) + ret %a } -declare @llvm.ctlz.nxv1i32(, i1) -define @ctlz_nxv2i32( %va) { -; RV32-LABEL: ctlz_nxv2i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv2i8( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv2i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv2i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: 
vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv2i32( %va, i1 false) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv2i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv2i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv2i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv2i8( %va, i1 true) + ret %a } -declare @llvm.ctlz.nxv2i32(, i1) -define @ctlz_nxv4i32( %va) { -; RV32-LABEL: ctlz_nxv4i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 
819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv4i8( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv4i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv4i32( %va, i1 false) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv4i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv4i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: vzext.vf4 v10, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v10 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32D-NEXT: 
vnsrl.wi v8, v10, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv4i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vzext.vf4 v10, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v10 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v10, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv4i8( %va, i1 true) + ret %a } -declare @llvm.ctlz.nxv4i32(, i1) -define @ctlz_nxv8i32( %va) { -; RV32-LABEL: ctlz_nxv8i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, mu -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv8i8( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv8i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_nxv8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m4, ta, mu -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw 
a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv8i32( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv8i32(, i1) - -define @ctlz_nxv16i32( %va) { -; RV32-LABEL: ctlz_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv16i32( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv16i32(, i1) - -define @ctlz_nxv1i64( %va) { -; RV32-LABEL: ctlz_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv 
v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: addi a0, zero, 32 -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 1 -; RV32-NEXT: vand.vv v9, v11, v9 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: vand.vv v9, v8, v10 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v9, (a0), zero -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsrl.vi v11, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v11 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vmul.vv v8, v8, v10 -; RV32-NEXT: addi a0, zero, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv1i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: addi a0, zero, 32 -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 21845 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 13107 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3855 -; RV64-NEXT: addiw a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv1i64( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv1i64(, i1) - -define @ctlz_nxv2i64( %va) { -; RV32-LABEL: ctlz_nxv2i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw 
a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: addi a0, zero, 32 -; RV32-NEXT: vsrl.vx v10, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 1 -; RV32-NEXT: vand.vv v10, v14, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v8, v12 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsrl.vi v14, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v14 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vmul.vv v8, v8, v12 -; RV32-NEXT: addi a0, zero, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv2i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: addi a0, zero, 32 -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 21845 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 13107 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 3855 -; RV64-NEXT: addiw a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv2i64( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv2i64(, i1) - -define @ctlz_nxv4i64( %va) { -; RV32-LABEL: ctlz_nxv4i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw 
a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: addi a0, zero, 32 -; RV32-NEXT: vsrl.vx v12, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 1 -; RV32-NEXT: vand.vv v12, v20, v12 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: vand.vv v12, v8, v16 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsrl.vi v20, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v20 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: addi a0, zero, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv4i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: addi a0, zero, 32 -; RV64-NEXT: vsrl.vx v12, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 21845 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 13107 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 3855 -; RV64-NEXT: addiw a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %a 
= call @llvm.ctlz.nxv4i64( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv4i64(, i1) - -define @ctlz_nxv8i64( %va) { -; RV32-LABEL: ctlz_nxv8i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: sw a0, 12(sp) -; RV32-NEXT: sw a0, 8(sp) -; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: addi a0, zero, 32 -; RV32-NEXT: vsrl.vx v16, v8, a0 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 1 -; RV32-NEXT: vand.vv v16, v0, v16 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: vand.vv v16, v8, v24 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v24, (a0), zero -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: addi a0, zero, 56 -; RV32-NEXT: vsrl.vx v8, v8, a0 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_nxv8i64: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: addi a0, zero, 32 -; RV64-NEXT: vsrl.vx v16, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 21845 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 13107 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 3855 -; RV64-NEXT: addiw a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 241 -; RV64-NEXT: slli a0, a0, 12 -; RV64-NEXT: addi a0, a0, 
-241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: slli a0, a0, 16 -; RV64-NEXT: addi a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 56 -; RV64-NEXT: vsrl.vx v8, v8, a0 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv8i64( %va, i1 false) - ret %a -} -declare @llvm.ctlz.nxv8i64(, i1) - -define @ctlz_zero_undef_nxv1i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv1i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv1i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv2i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv2i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv4i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv4i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv8i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, mu -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, 
zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv8i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv16i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, mu -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv16i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv32i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, mu -; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v12, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vsub.vv v8, v8, v12 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v12, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v12, v8 -; CHECK-NEXT: vsrl.vi v12, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv32i8( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv64i8( %va) { -; CHECK-LABEL: ctlz_zero_undef_nxv64i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu -; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 2 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vsrl.vi v16, v8, 4 -; CHECK-NEXT: vor.vv v8, v8, v16 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vsrl.vi v16, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v16, v16, a0 -; CHECK-NEXT: vsub.vv v8, v8, v16 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v16, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v16, v8 -; CHECK-NEXT: vsrl.vi v16, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret - %a = call @llvm.ctlz.nxv64i8( %va, i1 true) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv8i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, 
a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv8i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vzext.vf4 v12, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v12 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v12, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv8i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: vzext.vf4 v12, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v12 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v12, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv8i8( %va, i1 true) + ret %a } -define @ctlz_zero_undef_nxv1i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv1i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv16i8( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv16i8: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv1i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, mf4, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; 
RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv1i16( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv2i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv2i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv16i8: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv2i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv2i16( %va, i1 true) - ret %a -} - -define @ctlz_zero_undef_nxv4i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv4i16: -; RV32: # %bb.0: -; RV32-NEXT: 
vsetvli a0, zero, e16, m1, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV32D-LABEL: ctlz_zero_undef_nxv16i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV32D-NEXT: vzext.vf4 v16, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v16 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v16, 0 +; RV32D-NEXT: addi a0, zero, 134 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv4i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv4i16( %va, i1 true) - ret %a +; RV64D-LABEL: ctlz_zero_undef_nxv16i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m8, ta, mu +; RV64D-NEXT: vzext.vf4 v16, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v16 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v16, 0 +; RV64D-NEXT: addi a0, zero, 134 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv16i8( %va, i1 true) + ret %a } -define @ctlz_zero_undef_nxv8i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv8i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, mu -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx 
v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: ctlz_zero_undef_nxv8i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, mu -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv8i16( %va, i1 true) - ret %a +define @ctlz_zero_undef_nxv32i8( %va) { +; CHECK-LABEL: ctlz_zero_undef_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, mu +; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vxor.vi v8, v8, -1 +; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: addi a0, zero, 85 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vsub.vv v8, v8, v12 +; CHECK-NEXT: addi a0, zero, 51 +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v12, v8 +; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vand.vi v8, v8, 15 +; CHECK-NEXT: ret + %a = call @llvm.ctlz.nxv32i8( %va, i1 true) + ret %a } -define @ctlz_zero_undef_nxv16i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv16i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: 
ctlz_zero_undef_nxv16i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m4, ta, mu -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv16i16( %va, i1 true) - ret %a +define @ctlz_zero_undef_nxv64i8( %va) { +; CHECK-LABEL: ctlz_zero_undef_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, mu +; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 2 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: vxor.vi v8, v8, -1 +; CHECK-NEXT: vsrl.vi v16, v8, 1 +; CHECK-NEXT: addi a0, zero, 85 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vsub.vv v8, v8, v16 +; CHECK-NEXT: addi a0, zero, 51 +; CHECK-NEXT: vand.vx v16, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vand.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vsrl.vi v16, v8, 4 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vand.vi v8, v8, 15 +; CHECK-NEXT: ret + %a = call @llvm.ctlz.nxv64i8( %va, i1 true) + ret %a } -define @ctlz_zero_undef_nxv32i16( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv32i16: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v16 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv1i16( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv1i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; 
RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv32i16: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv32i16( %va, i1 true) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv1i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv1i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv1i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv1i16( %va, i1 true) + ret %a } -define 
@ctlz_zero_undef_nxv1i32( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv1i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +define @ctlz_zero_undef_nxv2i16( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv2i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv1i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.ctlz.nxv1i32( %va, i1 true) - ret %a +; RV64I-LABEL: ctlz_zero_undef_nxv2i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: 
vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv2i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv2i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv2i16( %va, i1 true) + ret %a +} + +define @ctlz_zero_undef_nxv4i16( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv4i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_zero_undef_nxv4i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx 
v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv4i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v10, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v8, v10, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv4i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v10, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v8, v10, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv4i16( %va, i1 true) + ret %a } -define @ctlz_zero_undef_nxv2i32( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv2i32: +define @ctlz_zero_undef_nxv8i16( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv8i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_zero_undef_nxv8i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv8i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; 
RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vsrl.vi v8, v12, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v8, v12, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv8i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vsrl.vi v8, v12, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v8, v12, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv8i16( %va, i1 true) + ret %a +} + +define @ctlz_zero_undef_nxv16i16( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv16i16: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_zero_undef_nxv16i16: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv16i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vsrl.vi v8, v16, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: addi a0, zero, 142 +; RV32D-NEXT: vrsub.vx v8, v16, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv16i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: 
vsrl.vi v8, v16, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 142 +; RV64D-NEXT: vrsub.vx v8, v16, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv16i16( %va, i1 true) + ret %a +} + +define @ctlz_zero_undef_nxv32i16( %va) { +; RV32-LABEL: ctlz_zero_undef_nxv32i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 +; RV32-NEXT: addi a0, zero, 257 ; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv2i32: +; RV64-LABEL: ctlz_zero_undef_nxv32i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vsetvli a0, zero, e16, m8, ta, mu +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 +; RV64-NEXT: vadd.vv v8, v16, 
v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 +; RV64-NEXT: addi a0, zero, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 +; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret + %a = call @llvm.ctlz.nxv32i16( %va, i1 true) + ret %a +} + +define @ctlz_zero_undef_nxv1i32( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv1i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_zero_undef_nxv1i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv1i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32D-NEXT: vsrl.vx v8, v9, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv1i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64D-NEXT: vsrl.vx v8, v9, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, 
v8, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.ctlz.nxv1i32( %va, i1 true) + ret %a +} + +define @ctlz_zero_undef_nxv2i32( %va) { +; RV32I-LABEL: ctlz_zero_undef_nxv2i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v9 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: ctlz_zero_undef_nxv2i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v9 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv2i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32D-NEXT: vsrl.vx v8, v10, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v8, v10, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv2i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64D-NEXT: vsrl.vx v8, v10, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v8, v10, a0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv2i32( %va, i1 true) ret %a } define @ctlz_zero_undef_nxv4i32( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv4i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli 
a0, zero, e32, m2, ta, mu -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv4i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v10 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv4i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, mu -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv4i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 
2 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v10 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv4i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32D-NEXT: vsrl.vx v8, v12, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v8, v12, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv4i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64D-NEXT: vsrl.vx v8, v12, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v8, v12, a0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv4i32( %va, i1 true) ret %a } define @ctlz_zero_undef_nxv8i32( %va) { -; RV32-LABEL: ctlz_zero_undef_nxv8i32: -; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m4, ta, mu -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 2 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 8 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 16 -; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: ctlz_zero_undef_nxv8i32: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 2 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 8 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 16 +; RV32I-NEXT: vor.vv v8, v8, v12 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; 
RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: ctlz_zero_undef_nxv8i32: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m4, ta, mu -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 2 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 8 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 16 -; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: ctlz_zero_undef_nxv8i32: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 2 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 8 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 16 +; RV64I-NEXT: vor.vv v8, v8, v12 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: ctlz_zero_undef_nxv8i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32D-NEXT: vsrl.vx v8, v16, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: addi a0, zero, 1054 +; RV32D-NEXT: vrsub.vx v8, v16, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: ctlz_zero_undef_nxv8i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: 
vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64D-NEXT: vsrl.vx v8, v16, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 1054 +; RV64D-NEXT: vrsub.vx v8, v16, a0 +; RV64D-NEXT: ret %a = call @llvm.ctlz.nxv8i32( %va, i1 true) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-sdnode.ll @@ -1,132 +1,439 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32I +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32D +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64D define @cttz_nxv1i8( %va) { -; CHECK-LABEL: cttz_nxv1i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_nxv1i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv1i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv1i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; 
RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv1i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv1i8( %va, i1 false) ret %a } declare @llvm.cttz.nxv1i8(, i1) define @cttz_nxv2i8( %va) { -; CHECK-LABEL: cttz_nxv2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_nxv2i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv2i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv2i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; 
RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv2i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv2i8( %va, i1 false) ret %a } declare @llvm.cttz.nxv2i8(, i1) define @cttz_nxv4i8( %va) { -; CHECK-LABEL: cttz_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_nxv4i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv4i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv4i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vzext.vf4 v10, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, 
v10 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v10, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv4i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vzext.vf4 v10, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v10 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v10, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv4i8( %va, i1 false) ret %a } declare @llvm.cttz.nxv4i8(, i1) define @cttz_nxv8i8( %va) { -; CHECK-LABEL: cttz_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_nxv8i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv8i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv8i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vzext.vf4 v12, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v12 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi 
v12, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v12, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv8i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vzext.vf4 v12, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v12 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v12, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv8i8( %va, i1 false) ret %a } declare @llvm.cttz.nxv8i8(, i1) define @cttz_nxv16i8( %va) { -; CHECK-LABEL: cttz_nxv16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu -; CHECK-NEXT: vsub.vx v10, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_nxv16i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv16i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv16i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV32D-NEXT: vmv.v.i v10, 0 +; RV32D-NEXT: vmseq.vv v0, v10, v8 +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v10 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vzext.vf4 v16, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v16 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32D-NEXT: 
vnsrl.wi v8, v16, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv16i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV64D-NEXT: vmv.v.i v10, 0 +; RV64D-NEXT: vmseq.vv v0, v10, v8 +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: vzext.vf4 v16, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v16 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v16, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv16i8( %va, i1 false) ret %a } @@ -185,26 +492,511 @@ declare @llvm.cttz.nxv64i8(, i1) define @cttz_nxv1i16( %va) { -; RV32-LABEL: cttz_nxv1i16: +; RV32I-LABEL: cttz_nxv1i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv1i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv1i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv1i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: 
vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv1i16( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv1i16(, i1) + +define @cttz_nxv2i16( %va) { +; RV32I-LABEL: cttz_nxv2i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv2i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv2i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv2i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv2i16( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv2i16(, i1) + +define 
@cttz_nxv4i16( %va) { +; RV32I-LABEL: cttz_nxv4i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv4i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv4i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32D-NEXT: vmv.v.i v9, 0 +; RV32D-NEXT: vmseq.vv v0, v9, v8 +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v10, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v10, a0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv4i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v10, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v10, a0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv4i16( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv4i16(, i1) + +define @cttz_nxv8i16( %va) { +; RV32I-LABEL: cttz_nxv8i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: 
vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv8i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv8i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32D-NEXT: vmv.v.i v10, 0 +; RV32D-NEXT: vmseq.vv v0, v10, v8 +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v10 +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vsrl.vi v8, v12, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v12, a0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv8i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64D-NEXT: vmv.v.i v10, 0 +; RV64D-NEXT: vmseq.vv v0, v10, v8 +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vsrl.vi v8, v12, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v12, a0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv8i16( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv8i16(, i1) + +define @cttz_nxv16i16( %va) { +; RV32I-LABEL: cttz_nxv16i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv16i16: +; RV64I: # %bb.0: +; 
RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv16i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32D-NEXT: vmv.v.i v12, 0 +; RV32D-NEXT: vmseq.vv v0, v12, v8 +; RV32D-NEXT: vrsub.vi v12, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v12 +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vsrl.vi v8, v16, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v16, a0 +; RV32D-NEXT: addi a0, zero, 16 +; RV32D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv16i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64D-NEXT: vmv.v.i v12, 0 +; RV64D-NEXT: vmseq.vv v0, v12, v8 +; RV64D-NEXT: vrsub.vi v12, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v12 +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: vsrl.vi v8, v16, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v16, a0 +; RV64D-NEXT: addi a0, zero, 16 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv16i16( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv16i16(, i1) + +define @cttz_nxv32i16( %va) { +; RV32-LABEL: cttz_nxv32i16: ; RV32: # %bb.0: ; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 +; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV32-NEXT: vsub.vx v16, v8, a0 ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 ; RV32-NEXT: lui a0, 5 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 ; RV32-NEXT: lui a0, 3 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 ; RV32-NEXT: lui a0, 1 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v8, v8, a0 @@ -213,26 +1005,26 @@ ; RV32-NEXT: vsrl.vi v8, v8, 8 ; RV32-NEXT: ret ; -; RV64-LABEL: cttz_nxv1i16: +; RV64-LABEL: cttz_nxv32i16: ; RV64: # %bb.0: ; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 +; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu +; RV64-NEXT: 
vsub.vx v16, v8, a0 ; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 ; RV64-NEXT: lui a0, 5 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 ; RV64-NEXT: lui a0, 3 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 ; RV64-NEXT: lui a0, 1 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 @@ -240,637 +1032,480 @@ ; RV64-NEXT: vmul.vx v8, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 8 ; RV64-NEXT: ret - %a = call @llvm.cttz.nxv1i16( %va, i1 false) - ret %a + %a = call @llvm.cttz.nxv32i16( %va, i1 false) + ret %a } -declare @llvm.cttz.nxv1i16(, i1) +declare @llvm.cttz.nxv32i16(, i1) -define @cttz_nxv2i16( %va) { -; RV32-LABEL: cttz_nxv2i16: +define @cttz_nxv1i32( %va) { +; RV32I-LABEL: cttz_nxv1i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv1i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv1i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v9, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32D-NEXT: vsrl.vx v9, v10, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v9, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; 
RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv1i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64D-NEXT: vsrl.vx v8, v9, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv1i32( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv1i32(, i1) + +define @cttz_nxv2i32( %va) { +; RV32I-LABEL: cttz_nxv2i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv2i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv2i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v9, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32D-NEXT: vsrl.vx v10, v10, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v9, v10, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v9, v9, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v9, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv2i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vmv.v.i v9, 0 +; RV64D-NEXT: vmseq.vv v0, v9, v8 +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; 
RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64D-NEXT: vsrl.vx v8, v10, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v10, a0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv2i32( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv2i32(, i1) + +define @cttz_nxv4i32( %va) { +; RV32I-LABEL: cttz_nxv4i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv4i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv4i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v10, v8, v10 +; RV32D-NEXT: vfwcvt.f.xu.v v12, v10 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32D-NEXT: vsrl.vx v12, v12, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v12, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v10, v10, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v10, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv4i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vmv.v.i v10, 0 +; RV64D-NEXT: vmseq.vv v0, v10, v8 +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64D-NEXT: vsrl.vx v8, v12, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v12, a0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v8, 
a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv4i32( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv4i32(, i1) + +define @cttz_nxv8i32( %va) { +; RV32I-LABEL: cttz_nxv8i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_nxv8i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_nxv8i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vrsub.vi v12, v8, 0 +; RV32D-NEXT: vand.vv v12, v8, v12 +; RV32D-NEXT: vfwcvt.f.xu.v v16, v12 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32D-NEXT: vsrl.vx v16, v16, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v16, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v12, v12, a0 +; RV32D-NEXT: vmseq.vi v0, v8, 0 +; RV32D-NEXT: addi a0, zero, 32 +; RV32D-NEXT: vmerge.vxm v8, v12, a0, v0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_nxv8i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: vmv.v.i v12, 0 +; RV64D-NEXT: vmseq.vv v0, v12, v8 +; RV64D-NEXT: vrsub.vi v12, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v12 +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64D-NEXT: vsrl.vx v8, v16, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v16, a0 +; RV64D-NEXT: addi a0, zero, 32 +; RV64D-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv8i32( %va, i1 false) + ret %a +} +declare @llvm.cttz.nxv8i32(, i1) + +define @cttz_nxv16i32( %va) { +; RV32-LABEL: cttz_nxv16i32: ; RV32: # %bb.0: ; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 +; RV32-NEXT: vsetvli a1, 
zero, e32, m8, ta, mu +; RV32-NEXT: vsub.vx v16, v8, a0 ; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsub.vv v8, v8, v16 +; RV32-NEXT: lui a0, 209715 ; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 +; RV32-NEXT: vand.vx v16, v8, a0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 +; RV32-NEXT: vadd.vv v8, v16, v8 +; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v16 +; RV32-NEXT: lui a0, 61681 ; RV32-NEXT: addi a0, a0, -241 ; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 +; RV32-NEXT: lui a0, 4112 +; RV32-NEXT: addi a0, a0, 257 ; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vsrl.vi v8, v8, 24 ; RV32-NEXT: ret ; -; RV64-LABEL: cttz_nxv2i16: +; RV64-LABEL: cttz_nxv16i32: ; RV64: # %bb.0: ; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 +; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu +; RV64-NEXT: vsub.vx v16, v8, a0 ; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 +; RV64-NEXT: vand.vv v8, v8, v16 +; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: lui a0, 349525 ; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsub.vv v8, v8, v16 +; RV64-NEXT: lui a0, 209715 ; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vand.vx v16, v8, a0 ; RV64-NEXT: vsrl.vi v8, v8, 2 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 +; RV64-NEXT: vadd.vv v8, v16, v8 +; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vadd.vv v8, v8, v16 +; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addiw a0, a0, -241 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 +; RV64-NEXT: lui a0, 4112 +; RV64-NEXT: addiw a0, a0, 257 ; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vsrl.vi v8, v8, 24 ; RV64-NEXT: ret - %a = call @llvm.cttz.nxv2i16( %va, i1 false) - ret %a + %a = call @llvm.cttz.nxv16i32( %va, i1 false) + ret %a } -declare @llvm.cttz.nxv2i16(, i1) +declare @llvm.cttz.nxv16i32(, i1) -define @cttz_nxv4i16( %va) { -; RV32-LABEL: cttz_nxv4i16: +define @cttz_nxv1i64( %va) { +; RV32-LABEL: cttz_nxv1i64: ; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 
-; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv4i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv4i16( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv4i16(, i1) - -define @cttz_nxv8i16( %va) { -; RV32-LABEL: cttz_nxv8i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; RV32-NEXT: vsub.vx v10, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv8i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv8i16( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv8i16(, i1) - -define @cttz_nxv16i16( %va) { -; RV32-LABEL: cttz_nxv16i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m4, ta, mu -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; 
RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv16i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv16i16( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv16i16(, i1) - -define @cttz_nxv32i16( %va) { -; RV32-LABEL: cttz_nxv32i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m8, ta, mu -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv32i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m8, ta, mu -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv32i16( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv32i16(, i1) - -define @cttz_nxv1i32( %va) { -; RV32-LABEL: cttz_nxv1i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; 
RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv1i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv1i32( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv1i32(, i1) - -define @cttz_nxv2i32( %va) { -; RV32-LABEL: cttz_nxv2i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv2i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv2i32( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv2i32(, i1) - -define @cttz_nxv4i32( %va) { -; RV32-LABEL: cttz_nxv4i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, mu -; RV32-NEXT: vsub.vx v10, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: 
vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv4i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, mu -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv4i32( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv4i32(, i1) - -define @cttz_nxv8i32( %va) { -; RV32-LABEL: cttz_nxv8i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv8i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv8i32( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv8i32(, i1) - -define @cttz_nxv16i32( %va) { -; RV32-LABEL: cttz_nxv16i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; 
RV32-NEXT: vsetvli a1, zero, e32, m8, ta, mu -; RV32-NEXT: vsub.vx v16, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vsrl.vi v16, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsub.vv v8, v8, v16 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v16, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v16 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_nxv16i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, mu -; RV64-NEXT: vsub.vx v16, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsub.vv v8, v8, v16 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v16, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v16 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv16i32( %va, i1 false) - ret %a -} -declare @llvm.cttz.nxv16i32(, i1) - -define @cttz_nxv1i64( %va) { -; RV32-LABEL: cttz_nxv1i64: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: lui a0, 349525 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: lui a0, 349525 ; RV32-NEXT: addi a0, a0, 1365 ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: sw a0, 8(sp) @@ -1290,126 +1925,401 @@ declare @llvm.cttz.nxv8i64(, i1) define @cttz_zero_undef_nxv1i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv1i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv1i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 
15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv1i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf8, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv1i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv1i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf8, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf8, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv1i8( %va, i1 true) ret %a } define @cttz_zero_undef_nxv2i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv2i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv2i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf4, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: 
vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv2i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vzext.vf4 v9, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v9 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv2i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vzext.vf4 v9, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v9 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv2i8( %va, i1 true) ret %a } define @cttz_zero_undef_nxv4i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv4i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv4i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; 
RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv4i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vzext.vf4 v10, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v10 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v10, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv4i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, mf2, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vzext.vf4 v10, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v10 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v10, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv4i8( %va, i1 true) ret %a } define @cttz_zero_undef_nxv8i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu -; CHECK-NEXT: vsub.vx v9, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vsub.vv v8, v8, v9 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v9, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v9 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv8i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv8i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv8i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, 
v8, v9 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vzext.vf4 v12, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v12 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v12, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv8i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, m1, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vzext.vf4 v12, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v12 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v12, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv8i8( %va, i1 true) ret %a } define @cttz_zero_undef_nxv16i8( %va) { -; CHECK-LABEL: cttz_zero_undef_nxv16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, zero, 1 -; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, mu -; CHECK-NEXT: vsub.vx v10, v8, a0 -; CHECK-NEXT: vxor.vi v8, v8, -1 -; CHECK-NEXT: vand.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 1 -; CHECK-NEXT: addi a0, zero, 85 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: addi a0, zero, 51 -; CHECK-NEXT: vand.vx v10, v8, a0 -; CHECK-NEXT: vsrl.vi v8, v8, 2 -; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vadd.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 4 -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: vand.vi v8, v8, 15 -; CHECK-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv16i8: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: addi a0, zero, 85 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: addi a0, zero, 51 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vand.vi v8, v8, 15 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv16i8: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e8, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: addi a0, zero, 85 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: addi a0, zero, 51 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vand.vi v8, v8, 15 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv16i8: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v10 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vzext.vf4 v16, v8 +; RV32D-NEXT: vfcvt.f.xu.v v8, v16 +; RV32D-NEXT: vsrl.vi v8, v8, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; 
RV32D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v16, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv16i8: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e8, m2, ta, mu +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: vzext.vf4 v16, v8 +; RV64D-NEXT: vfcvt.f.xu.v v8, v16 +; RV64D-NEXT: vsrl.vi v8, v8, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v16, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv16i8( %va, i1 true) ret %a } @@ -1465,301 +2375,441 @@ } define @cttz_zero_undef_nxv1i16( %va) { -; RV32-LABEL: cttz_zero_undef_nxv1i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf4, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv1i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv1i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf4, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv1i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; 
RV64I-NEXT: vsetvli a1, zero, e16, mf4, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv1i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv1i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf4, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv1i16( %va, i1 true) ret %a } define @cttz_zero_undef_nxv2i16( %va) { -; RV32-LABEL: cttz_zero_undef_nxv2i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret -; -; RV64-LABEL: cttz_zero_undef_nxv2i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call 
@llvm.cttz.nxv2i16( %va, i1 true) - ret %a -} - -define @cttz_zero_undef_nxv4i16( %va) { -; RV32-LABEL: cttz_zero_undef_nxv4i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv2i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv4i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret - %a = call @llvm.cttz.nxv4i16( %va, i1 true) - ret %a -} - -define @cttz_zero_undef_nxv8i16( %va) { -; RV32-LABEL: cttz_zero_undef_nxv8i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; RV32-NEXT: vsub.vx v10, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: 
vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv2i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv8i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m2, ta, mu -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 3 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret +; RV32D-LABEL: cttz_zero_undef_nxv2i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vsrl.vi v8, v9, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv2i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vsrl.vi v8, v9, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv2i16( %va, i1 true) + ret %a +} + +define @cttz_zero_undef_nxv4i16( %va) { +; RV32I-LABEL: cttz_zero_undef_nxv4i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 1 +; 
RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv4i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv4i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vsrl.vi v8, v10, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v10, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv4i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m1, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vsrl.vi v8, v10, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v10, a0 +; RV64D-NEXT: ret + %a = call @llvm.cttz.nxv4i16( %va, i1 true) + ret %a +} + +define @cttz_zero_undef_nxv8i16( %va) { +; RV32I-LABEL: cttz_zero_undef_nxv8i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret +; +; RV64I-LABEL: cttz_zero_undef_nxv8i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: 
vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv8i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v10 +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vsrl.vi v8, v12, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v12, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv8i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m2, ta, mu +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vsrl.vi v8, v12, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v12, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv8i16( %va, i1 true) ret %a } define @cttz_zero_undef_nxv16i16( %va) { -; RV32-LABEL: cttz_zero_undef_nxv16i16: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e16, m4, ta, mu -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 5 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 3 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 1 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: addi a0, zero, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 8 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv16i16: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 5 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 3 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 1 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: addi a0, zero, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 8 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv16i16: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e16, m4, ta, mu -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 5 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 3 
-; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 1 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: addi a0, zero, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 8 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv16i16: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e16, m4, ta, mu +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 5 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 3 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 1 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: addi a0, zero, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 8 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv16i16: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV32D-NEXT: vrsub.vi v12, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v12 +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV32D-NEXT: vsrl.vi v8, v16, 23 +; RV32D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: addi a0, zero, 127 +; RV32D-NEXT: vsub.vx v8, v16, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv16i16: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e16, m4, ta, mu +; RV64D-NEXT: vrsub.vi v12, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v12 +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; RV64D-NEXT: vsrl.vi v8, v16, 23 +; RV64D-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 127 +; RV64D-NEXT: vsub.vx v8, v16, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv16i16( %va, i1 true) ret %a } @@ -1825,249 +2875,369 @@ } define @cttz_zero_undef_nxv1i32( %va) { -; RV32-LABEL: cttz_zero_undef_nxv1i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv1i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: 
vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv1i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv1i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, mf2, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv1i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32D-NEXT: vsrl.vx v8, v9, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV32D-NEXT: vnsrl.wi v8, v8, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v8, v8, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv1i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, mf2, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v9, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64D-NEXT: vsrl.vx v8, v9, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; RV64D-NEXT: vnsrl.wi v8, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v8, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv1i32( %va, i1 true) ret %a } define 
@cttz_zero_undef_nxv2i32( %va) { -; RV32-LABEL: cttz_zero_undef_nxv2i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; RV32-NEXT: vsub.vx v9, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vsub.vv v8, v8, v9 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v9, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v9 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv2i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; RV32I-NEXT: vsub.vx v9, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v9 +; RV32I-NEXT: vsrl.vi v9, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v9, v9, a0 +; RV32I-NEXT: vsub.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v9, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v9, v8 +; RV32I-NEXT: vsrl.vi v9, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv2i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; RV64-NEXT: vsub.vx v9, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsub.vv v8, v8, v9 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v9, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v9 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv2i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; RV64I-NEXT: vsub.vx v9, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v9 +; RV64I-NEXT: vsrl.vi v9, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v9, v9, a0 +; RV64I-NEXT: vsub.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v9, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v9, v8 +; RV64I-NEXT: vsrl.vi v9, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 
257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv2i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV32D-NEXT: vrsub.vi v9, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v9 +; RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32D-NEXT: vsrl.vx v8, v10, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV32D-NEXT: vnsrl.wi v10, v8, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v8, v10, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv2i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; RV64D-NEXT: vrsub.vi v9, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v9 +; RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64D-NEXT: vsrl.vx v8, v10, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; RV64D-NEXT: vnsrl.wi v10, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v10, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv2i32( %va, i1 true) ret %a } define @cttz_zero_undef_nxv4i32( %va) { -; RV32-LABEL: cttz_zero_undef_nxv4i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, mu -; RV32-NEXT: vsub.vx v10, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v10, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v10 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv4i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; RV32I-NEXT: vsub.vx v10, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v10, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v10, v10, a0 +; RV32I-NEXT: vsub.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v10, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v10, v8 +; RV32I-NEXT: vsrl.vi v10, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv4i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, mu -; RV64-NEXT: vsub.vx v10, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsub.vv v8, v8, v10 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v10, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 
2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv4i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; RV64I-NEXT: vsub.vx v10, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v10, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v10, v10, a0 +; RV64I-NEXT: vsub.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v10, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v10, v8 +; RV64I-NEXT: vsrl.vi v10, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv4i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV32D-NEXT: vrsub.vi v10, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v10 +; RV32D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32D-NEXT: vsrl.vx v8, v12, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV32D-NEXT: vnsrl.wi v12, v8, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v8, v12, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv4i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m2, ta, mu +; RV64D-NEXT: vrsub.vi v10, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v10 +; RV64D-NEXT: vfwcvt.f.xu.v v12, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64D-NEXT: vsrl.vx v8, v12, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; RV64D-NEXT: vnsrl.wi v12, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v12, a0 +; RV64D-NEXT: ret %a = call @llvm.cttz.nxv4i32( %va, i1 true) ret %a } define @cttz_zero_undef_nxv8i32( %va) { -; RV32-LABEL: cttz_zero_undef_nxv8i32: -; RV32: # %bb.0: -; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, mu -; RV32-NEXT: vsub.vx v12, v8, a0 -; RV32-NEXT: vxor.vi v8, v8, -1 -; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 1 -; RV32-NEXT: lui a0, 349525 -; RV32-NEXT: addi a0, a0, 1365 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsub.vv v8, v8, v12 -; RV32-NEXT: lui a0, 209715 -; RV32-NEXT: addi a0, a0, 819 -; RV32-NEXT: vand.vx v12, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: vadd.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v12 -; RV32-NEXT: lui a0, 61681 -; RV32-NEXT: addi a0, a0, -241 -; RV32-NEXT: vand.vx v8, v8, a0 -; RV32-NEXT: lui a0, 4112 -; RV32-NEXT: addi a0, a0, 257 -; RV32-NEXT: vmul.vx v8, v8, a0 -; RV32-NEXT: vsrl.vi v8, v8, 24 -; RV32-NEXT: ret +; RV32I-LABEL: cttz_zero_undef_nxv8i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a0, zero, 1 +; RV32I-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV32I-NEXT: vsub.vx v12, v8, a0 +; RV32I-NEXT: vxor.vi v8, v8, -1 +; RV32I-NEXT: vand.vv v8, 
v8, v12 +; RV32I-NEXT: vsrl.vi v12, v8, 1 +; RV32I-NEXT: lui a0, 349525 +; RV32I-NEXT: addi a0, a0, 1365 +; RV32I-NEXT: vand.vx v12, v12, a0 +; RV32I-NEXT: vsub.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 209715 +; RV32I-NEXT: addi a0, a0, 819 +; RV32I-NEXT: vand.vx v12, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 2 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: vadd.vv v8, v12, v8 +; RV32I-NEXT: vsrl.vi v12, v8, 4 +; RV32I-NEXT: vadd.vv v8, v8, v12 +; RV32I-NEXT: lui a0, 61681 +; RV32I-NEXT: addi a0, a0, -241 +; RV32I-NEXT: vand.vx v8, v8, a0 +; RV32I-NEXT: lui a0, 4112 +; RV32I-NEXT: addi a0, a0, 257 +; RV32I-NEXT: vmul.vx v8, v8, a0 +; RV32I-NEXT: vsrl.vi v8, v8, 24 +; RV32I-NEXT: ret ; -; RV64-LABEL: cttz_zero_undef_nxv8i32: -; RV64: # %bb.0: -; RV64-NEXT: addi a0, zero, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m4, ta, mu -; RV64-NEXT: vsub.vx v12, v8, a0 -; RV64-NEXT: vxor.vi v8, v8, -1 -; RV64-NEXT: vand.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 1 -; RV64-NEXT: lui a0, 349525 -; RV64-NEXT: addiw a0, a0, 1365 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsub.vv v8, v8, v12 -; RV64-NEXT: lui a0, 209715 -; RV64-NEXT: addiw a0, a0, 819 -; RV64-NEXT: vand.vx v12, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 2 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vadd.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 4 -; RV64-NEXT: vadd.vv v8, v8, v12 -; RV64-NEXT: lui a0, 61681 -; RV64-NEXT: addiw a0, a0, -241 -; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: lui a0, 4112 -; RV64-NEXT: addiw a0, a0, 257 -; RV64-NEXT: vmul.vx v8, v8, a0 -; RV64-NEXT: vsrl.vi v8, v8, 24 -; RV64-NEXT: ret +; RV64I-LABEL: cttz_zero_undef_nxv8i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a0, zero, 1 +; RV64I-NEXT: vsetvli a1, zero, e32, m4, ta, mu +; RV64I-NEXT: vsub.vx v12, v8, a0 +; RV64I-NEXT: vxor.vi v8, v8, -1 +; RV64I-NEXT: vand.vv v8, v8, v12 +; RV64I-NEXT: vsrl.vi v12, v8, 1 +; RV64I-NEXT: lui a0, 349525 +; RV64I-NEXT: addiw a0, a0, 1365 +; RV64I-NEXT: vand.vx v12, v12, a0 +; RV64I-NEXT: vsub.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 209715 +; RV64I-NEXT: addiw a0, a0, 819 +; RV64I-NEXT: vand.vx v12, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 2 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: vadd.vv v8, v12, v8 +; RV64I-NEXT: vsrl.vi v12, v8, 4 +; RV64I-NEXT: vadd.vv v8, v8, v12 +; RV64I-NEXT: lui a0, 61681 +; RV64I-NEXT: addiw a0, a0, -241 +; RV64I-NEXT: vand.vx v8, v8, a0 +; RV64I-NEXT: lui a0, 4112 +; RV64I-NEXT: addiw a0, a0, 257 +; RV64I-NEXT: vmul.vx v8, v8, a0 +; RV64I-NEXT: vsrl.vi v8, v8, 24 +; RV64I-NEXT: ret +; +; RV32D-LABEL: cttz_zero_undef_nxv8i32: +; RV32D: # %bb.0: +; RV32D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV32D-NEXT: vrsub.vi v12, v8, 0 +; RV32D-NEXT: vand.vv v8, v8, v12 +; RV32D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV32D-NEXT: addi a0, zero, 52 +; RV32D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32D-NEXT: vsrl.vx v8, v16, a0 +; RV32D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32D-NEXT: vnsrl.wi v16, v8, 0 +; RV32D-NEXT: addi a0, zero, 1023 +; RV32D-NEXT: vsub.vx v8, v16, a0 +; RV32D-NEXT: ret +; +; RV64D-LABEL: cttz_zero_undef_nxv8i32: +; RV64D: # %bb.0: +; RV64D-NEXT: vsetvli a0, zero, e32, m4, ta, mu +; RV64D-NEXT: vrsub.vi v12, v8, 0 +; RV64D-NEXT: vand.vv v8, v8, v12 +; RV64D-NEXT: vfwcvt.f.xu.v v16, v8 +; RV64D-NEXT: addi a0, zero, 52 +; RV64D-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64D-NEXT: vsrl.vx v8, v16, a0 +; RV64D-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV64D-NEXT: vnsrl.wi v16, v8, 0 +; RV64D-NEXT: addi a0, zero, 1023 +; RV64D-NEXT: vsub.vx v8, v16, a0 +; RV64D-NEXT: ret %a = call 
@llvm.cttz.nxv8i32( %va, i1 true) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -1,8 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I ; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64 define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind { ; LMULMAX2-RV32-LABEL: ctlz_v16i8: @@ -108,6 +114,42 @@ ; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 ; LMULMAX1-RV64-NEXT: vse8.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v16i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v8 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0 +; 
LMULMAX8-RV32-NEXT: addi a1, zero, 134 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vrsub.vx v8, v9, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v16i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v8 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 134 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vrsub.vx v8, v9, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y %c = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) @@ -117,75 +159,75 @@ declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v8i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v8i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: 
vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v8i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v8i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 1 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v8i16: ; LMULMAX1-RV32: # %bb.0: @@ -256,6 +298,62 @@ ; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 ; LMULMAX1-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v8i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; 
LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 142 +; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v8i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 142 +; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v8i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 142 +; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v8i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 142 +; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) @@ -265,81 +363,81 @@ declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v4i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: 
vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v4i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v4i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v4i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi 
v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; ; LMULMAX1-RV32-LABEL: ctlz_v4i32: ; LMULMAX1-RV32: # %bb.0: @@ -416,6 +514,66 @@ ; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 ; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v4i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1054 +; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v4i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1054 +; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v4i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v4i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1054 +; 
LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) @@ -666,6 +824,127 @@ ; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v2i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v9, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v2i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; 
LMULMAX8-RV64-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) @@ -820,6 +1099,44 @@ ; LMULMAX1-RV64-NEXT: vse8.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse8.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v32i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v8 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 134 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vrsub.vx v8, v10, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v32i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v8 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV64-NEXT: addi a1, 
zero, 134 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vrsub.vx v8, v10, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y %c = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false) @@ -1016,6 +1333,34 @@ ; LMULMAX1-RV64-NEXT: vse16.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse16.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v16i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 142 +; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v16i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 142 +; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false) @@ -1228,6 +1573,36 @@ ; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v8i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v8i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false) @@ -1534,6 +1909,127 @@ ; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a7) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v4i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX8-RV32-NEXT: 
vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v4i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; 
LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -1,8 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I ; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 ; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 
-riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64 define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind { ; LMULMAX2-RV32-LABEL: cttz_v16i8: @@ -96,6 +102,46 @@ ; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 ; LMULMAX1-RV64-NEXT: vse8.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v16i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v9 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vsub.vx v8, v9, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v16i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v9 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vsub.vx v8, v9, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y %c = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 false) @@ -105,65 +151,65 @@ declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1) define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v8i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui 
a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v8i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v8i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v8i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 1 +; 
LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v8i16: ; LMULMAX1-RV32: # %bb.0: @@ -224,6 +270,70 @@ ; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 ; LMULMAX1-RV64-NEXT: vse16.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: cttz_v8i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 127 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v8i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 127 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v8i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v8i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) @@ -233,67 +343,67 @@ declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1) define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v4i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: 
vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v4i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v4i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v4i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; 
LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; ; LMULMAX1-RV32-LABEL: cttz_v4i32: ; LMULMAX1-RV32: # %bb.0: @@ -356,6 +466,74 @@ ; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 ; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: cttz_v4i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1023 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v4i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1023 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v4i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v4i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV64-NEXT: 
addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) @@ -566,6 +744,107 @@ ; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v2i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: addi a1, zero, 1 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v8, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v2i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: addi a1, zero, 1 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; 
LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) @@ -700,6 +979,48 @@ ; LMULMAX1-RV64-NEXT: vse8.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse8.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v32i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v10 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vsub.vx v8, v10, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v32i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v10 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vsub.vx v8, v10, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y %c = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false) @@ -864,6 +1185,38 @@ ; LMULMAX1-RV64-NEXT: vse16.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse16.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v16i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: 
vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v16i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false) @@ -1032,6 +1385,40 @@ ; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v8i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v8i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false) @@ -1278,6 +1665,107 @@ ; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a7) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v4i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: addi a1, zero, 1 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v8, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v12, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v12 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 
1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v4i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: addi a1, zero, 1 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: 
vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)