diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7063,14 +7063,15 @@ } // If the ZERO_UNDEF version is supported use that and handle the zero case. - if (isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { + if (Node->getOpcode() == ISD::CTLZ && + isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); - Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, - DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ); + Result = DAG.getSelect(dl, VT, SrcIsZero, + DAG.getConstant(NumBitsPerElt, dl, VT), CTLZ); return true; } @@ -7117,14 +7118,15 @@ } // If the ZERO_UNDEF version is supported use that and handle the zero case. 
- if (isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { + if (Node->getOpcode() == ISD::CTTZ && + isOperationLegalOrCustom(ISD::CTTZ_ZERO_UNDEF, VT)) { EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue CTTZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, VT, Op); SDValue Zero = DAG.getConstant(0, dl, VT); SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ); - Result = DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero, - DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ); + Result = DAG.getSelect(dl, VT, SrcIsZero, + DAG.getConstant(NumBitsPerElt, dl, VT), CTTZ); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -819,6 +819,16 @@ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + // Lower CTLZ_ZERO_UNDEF and CTTZ_ZERO_UNDEF if we have a floating point + // type that can represent the value exactly. + if (((VT.getVectorElementType() == MVT::i8 || + VT.getVectorElementType() == MVT::i16) && + Subtarget.hasStdExtF()) || + (VT.getVectorElementType() == MVT::i32 && Subtarget.hasStdExtD())) { + setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + } + for (unsigned VPOpc : IntegerVPOps) setOperationAction(VPOpc, VT, Custom); } @@ -2278,6 +2288,57 @@ return DAG.getNode(RVVOpc, DL, ContainerVT, Op, Mask, VL); } +// Lower CTLZ_ZERO_UNDEF or CTTZ_ZERO_UNDEF by converting to FP and extracting +// the exponent. +static SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + unsigned EltSize = VT.getScalarSizeInBits(); + SDValue Src = Op.getOperand(0); + SDLoc DL(Op); + + // We need a FP type that can represent the value exactly. + MVT FloatEltVT = EltSize == 32 ? 
MVT::f64 : MVT::f32; + EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), FloatEltVT, + VT.getVectorElementCount()); + + // Make sure the float type is legal. If not, fall back to expand. + // TODO: Splitting may make sense in some cases. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(FloatVT)) + return SDValue(); + + // For CTTZ_ZERO_UNDEF, we need to extract the lowest set bit using X & -X. + // The trailing zero count is equal to log2 of this single bit value. + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) { + SDValue Neg = + DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); + Src = DAG.getNode(ISD::AND, DL, VT, Src, Neg); + } + + // We have a legal FP type, convert to it. + SDValue FloatVal = DAG.getNode(ISD::UINT_TO_FP, DL, FloatVT, Src); + // Bitcast to integer and shift the exponent to the LSB. + EVT IntVT = FloatVT.changeVectorElementTypeToInteger(); + SDValue Bitcast = DAG.getBitcast(IntVT, FloatVal); + unsigned ShiftAmt = FloatEltVT == MVT::f64 ? 52 : 23; + SDValue Shift = DAG.getNode(ISD::SRL, DL, IntVT, Bitcast, + DAG.getConstant(ShiftAmt, DL, IntVT)); + // Truncate back to original type to allow vnsrl. + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Shift); + // The exponent contains log2 of the value in biased form. + unsigned ExponentBias = FloatEltVT == MVT::f64 ? 1023 : 127; + + // For trailing zeros, we just need to subtract the bias. + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) + return DAG.getNode(ISD::SUB, DL, VT, Trunc, + DAG.getConstant(ExponentBias, DL, VT)); + + // For leading zeros, we need to remove the bias and convert from log2 to + // leading zeros. We can do this by subtracting from (Bias + (EltSize - 1)). + unsigned Adjust = ExponentBias + (EltSize - 1); + return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(Adjust, DL, VT), Trunc); +} + // While RVV has alignment restrictions, we should always be able to load as a // legal equivalently-sized byte-typed vector instead. 
This method is // responsible for re-expressing a ISD::LOAD via a correctly-aligned type. If @@ -2892,6 +2953,9 @@ return lowerToScalableOp(Op, DAG, RISCVISD::FMAXNUM_VL); case ISD::ABS: return lowerABS(Op, DAG); + case ISD::CTLZ_ZERO_UNDEF: + case ISD::CTTZ_ZERO_UNDEF: + return lowerCTLZ_CTTZ_ZERO_UNDEF(Op, DAG); case ISD::VSELECT: return lowerFixedLengthVectorSelectToRVV(Op, DAG); case ISD::FCOPYSIGN: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -1,113 +1,241 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v 
-riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32,LMULMAX1-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64,LMULMAX1-RV32I +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32,LMULMAX1-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64,LMULMAX1-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64 define void @ctlz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v16i8: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: 
vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v16i8: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV32I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v16i8: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; 
LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v16i8: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV64I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v16i8: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi 
v9, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: addi a1, zero, 85 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: addi a1, zero, 51 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX1-RV32I-LABEL: ctlz_v16i8: +; LMULMAX1-RV32I: # %bb.0: +; LMULMAX1-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX1-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX1-RV32I-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV32I-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v16i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; 
LMULMAX1-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 85 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: addi a1, zero, 51 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV32D-LABEL: ctlz_v16i8: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32D-NEXT: vor.vv v9, v8, v9 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX2-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vi v8, v9, 15 +; LMULMAX2-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV32D-NEXT: vse8.v v8, 
(a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v16i8: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64D-NEXT: vor.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX2-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vi v8, v9, 15 +; LMULMAX2-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v16i8: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; 
LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v8, v9, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV32D-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v16i8: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v8, v9, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v16i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v8 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0 +; 
LMULMAX8-RV32-NEXT: addi a1, zero, 134 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vrsub.vx v8, v9, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v16i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v8 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 134 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vrsub.vx v8, v9, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y %c = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) @@ -117,145 +245,207 @@ declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) define void @ctlz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v8i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; 
LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v8i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; 
LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v8i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v8i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: 
lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 1 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v8i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 5 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 3 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 1 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: addi a1, zero, 257 
-; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: ctlz_v8i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 142 +; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v8i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 5 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 3 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 1 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 -; 
LMULMAX1-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: ctlz_v8i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 142 +; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v8i16: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 8 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: lui a1, 5 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 3 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 1 +; LMULMAX1-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 8 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; 
LMULMAX1-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v8i16: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: lui a1, 5 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 3 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 1 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 8 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v8i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 
142 +; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v8i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 142 +; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) @@ -265,157 +455,223 @@ declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) define void @ctlz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v4i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; 
LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v4i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: 
vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v4i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v4i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, 
v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: 
addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: ctlz_v4i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1054 +; LMULMAX2-RV32D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 349525 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; 
LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 209715 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 61681 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: ctlz_v4i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1054 +; LMULMAX2-RV64D-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v4i32: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 8 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 16 +; LMULMAX1-RV32D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; 
LMULMAX1-RV32D-NEXT: lui a1, 349525 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 209715 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 61681 +; LMULMAX1-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: lui a1, 4112 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 24 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v4i32: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v8, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 16 +; LMULMAX1-RV64D-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: lui a1, 349525 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 209715 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: 
vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 61681 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: lui a1, 4112 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 24 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v4i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV32-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v4i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v8 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV64-NEXT: vrsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) @@ -660,166 +916,437 @@ ; 
LMULMAX1-RV64-NEXT: slli a1, a1, 16 ; LMULMAX1-RV64-NEXT: addi a1, a1, 257 ; LMULMAX1-RV64-NEXT: slli a1, a1, 16 -; LMULMAX1-RV64-NEXT: addi a1, a1, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 56 -; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret - %a = load <2 x i64>, <2 x i64>* %x - %b = load <2 x i64>, <2 x i64>* %y - %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) - store <2 x i64> %c, <2 x i64>* %x - ret void -} -declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) - -define void @ctlz_v32i8(<32 x i8>* %x, <32 x i8>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v32i8: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret -; -; LMULMAX2-RV64-LABEL: ctlz_v32i8: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, 
v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret -; -; LMULMAX1-RV32-LABEL: ctlz_v32i8: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle8.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle8.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: addi a2, zero, 85 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: addi a3, zero, 51 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; 
LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vi v9, v9, 15 -; LMULMAX1-RV32-NEXT: vse8.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse8.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: ctlz_v32i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle8.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle8.v v9, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: addi a2, zero, 85 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: addi a3, zero, 51 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; 
LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vi v9, v9, 15 -; LMULMAX1-RV64-NEXT: vse8.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV64-NEXT: addi a1, a1, 257 +; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX1-RV64-NEXT: addi a1, zero, 56 +; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v2i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v9, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 +; 
LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v2i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 2 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 8 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 16 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsrl.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; 
LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) + store <2 x i64> %c, <2 x i64>* %x + ret void +} +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) + +define void @ctlz_v32i8(<32 x i8>* %x, <32 x 
i8>* %y) nounwind { +; LMULMAX2-RV32I-LABEL: ctlz_v32i8: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32I-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV32I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret +; +; LMULMAX2-RV64I-LABEL: ctlz_v32i8: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64I-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv 
v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV64I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret +; +; LMULMAX1-RV32I-LABEL: ctlz_v32i8: +; LMULMAX1-RV32I: # %bb.0: +; LMULMAX1-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32I-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32I-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV32I-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX1-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32I-NEXT: addi a2, zero, 85 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v10, a2 +; LMULMAX1-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: addi a3, zero, 51 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v8, a3 +; LMULMAX1-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v8, v8, a3 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32I-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 2 +; LMULMAX1-RV32I-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32I-NEXT: vor.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v10, a2 +; LMULMAX1-RV32I-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v9, a3 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v9, a3 +; LMULMAX1-RV32I-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: 
vand.vi v9, v9, 15 +; LMULMAX1-RV32I-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV32I-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV32I-NEXT: ret +; +; LMULMAX2-RV32D-LABEL: ctlz_v32i8: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vi v8, v10, 15 +; LMULMAX2-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV32D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: ctlz_v32i8: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; 
LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vi v8, v10, 15 +; LMULMAX2-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v32i8: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: addi a2, zero, 85 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: addi a3, zero, 51 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v8, v10, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, 
v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v9, v10, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v9, v9, 8, v0 +; LMULMAX1-RV32D-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v32i8: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: addi a2, zero, 85 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: addi a3, zero, 51 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v8, v10, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV64D-NEXT: 
vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v9, v10, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v9, v9, 8, v0 +; LMULMAX1-RV64D-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v32i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v8 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 134 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vrsub.vx v8, v10, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v32i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v8 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16 +; 
LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 134 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vrsub.vx v8, v10, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y %c = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %a, i1 false) @@ -829,193 +1356,307 @@ declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>, i1) define void @ctlz_v16i16(<16 x i16>* %x, <16 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v16i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; 
LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v16i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v16i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV64-NEXT: 
vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v16i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 
+; LMULMAX2-RV64I-NEXT: lui a1, 1 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v16i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle16.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle16.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a2, 5 -; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a3, 3 -; LMULMAX1-RV32-NEXT: addi a3, a3, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 1 -; LMULMAX1-RV32-NEXT: addi a4, a4, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: addi a5, zero, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 
8 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 8 -; LMULMAX1-RV32-NEXT: vse16.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse16.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: ctlz_v16i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 8 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: lui a1, 5 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 3 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 1 +; LMULMAX2-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 257 +; 
LMULMAX2-RV32D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v16i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle16.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle16.v v9, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a2, 5 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a3, 3 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 1 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: addi a5, zero, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8 -; 
LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 8 -; LMULMAX1-RV64-NEXT: vse16.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse16.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: ctlz_v16i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 8 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: lui a1, 5 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 3 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 1 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 257 +; 
LMULMAX2-RV64D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v16i16: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle16.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle16.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: lui a2, 5 +; LMULMAX1-RV32D-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a3, 3 +; LMULMAX1-RV32D-NEXT: addi a3, a3, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a4, 1 +; LMULMAX1-RV32D-NEXT: addi a4, a4, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: addi a5, zero, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a6, zero, 16 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v10, a6, v0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV32D-NEXT: 
vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v10, 8 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v9, v9, a6, v0 +; LMULMAX1-RV32D-NEXT: vse16.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse16.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v16i16: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle16.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle16.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: lui a2, 5 +; LMULMAX1-RV64D-NEXT: addiw a2, a2, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a3, 3 +; LMULMAX1-RV64D-NEXT: addiw a3, a3, 819 +; LMULMAX1-RV64D-NEXT: vand.vx 
v11, v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a4, 1 +; LMULMAX1-RV64D-NEXT: addiw a4, a4, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: addi a5, zero, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a6, zero, 16 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v10, a6, v0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v10, 8 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v9, v9, a6, v0 +; LMULMAX1-RV64D-NEXT: vse16.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse16.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v16i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8 +; 
LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 142 +; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v16i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 142 +; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %a, i1 false) @@ -1025,209 +1666,331 @@ declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>, i1) define void @ctlz_v8i32(<8 x i32>* %x, <8 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: ctlz_v8i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; 
LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: ctlz_v8i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV32I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; 
LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: ctlz_v8i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX2-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: ctlz_v8i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; 
LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX2-RV64I-NEXT: vor.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: ctlz_v8i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 2 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV32-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, 
-1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a2, 349525 -; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a3, 209715 -; LMULMAX1-RV32-NEXT: addi a3, a3, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 61681 -; LMULMAX1-RV32-NEXT: addi a4, a4, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: lui a5, 4112 -; LMULMAX1-RV32-NEXT: addi a5, a5, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 8 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 16 -; LMULMAX1-RV32-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: ctlz_v8i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli 
zero, 8, e32, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 8 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 16 +; LMULMAX2-RV32D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: lui a1, 349525 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 209715 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 61681 +; LMULMAX2-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: lui a1, 4112 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: ctlz_v8i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 2 -; 
LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 8 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 16 -; LMULMAX1-RV64-NEXT: vor.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a2, 349525 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a3, 209715 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 61681 -; LMULMAX1-RV64-NEXT: addiw a4, a4, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: lui a5, 4112 -; LMULMAX1-RV64-NEXT: addiw a5, a5, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 2 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 8 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 16 -; LMULMAX1-RV64-NEXT: vor.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a2 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a3 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a3 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, 
v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: ctlz_v8i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 2 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 8 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 16 +; LMULMAX2-RV64D-NEXT: vor.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: lui a1, 349525 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 209715 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 61681 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: lui a1, 4112 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: ctlz_v8i32: 
+; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle32.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 16 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: lui a2, 349525 +; LMULMAX1-RV32D-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a3, 209715 +; LMULMAX1-RV32D-NEXT: addi a3, a3, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a4, 61681 +; LMULMAX1-RV32D-NEXT: addi a4, a4, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: lui a5, 4112 +; LMULMAX1-RV32D-NEXT: addi a5, a5, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a6, zero, 32 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v10, a6, v0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 
8 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 16 +; LMULMAX1-RV32D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v10, 24 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v9, v9, a6, v0 +; LMULMAX1-RV32D-NEXT: vse32.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: ctlz_v8i32: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle32.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v8, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 16 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: lui a2, 349525 +; LMULMAX1-RV64D-NEXT: addiw a2, a2, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a3, 209715 +; LMULMAX1-RV64D-NEXT: addiw a3, a3, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v11, 
v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a4, 61681 +; LMULMAX1-RV64D-NEXT: addiw a4, a4, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: lui a5, 4112 +; LMULMAX1-RV64D-NEXT: addiw a5, a5, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a6, zero, 32 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v10, a6, v0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v9, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 2 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 8 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 16 +; LMULMAX1-RV64D-NEXT: vor.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v10, -1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a2 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a3 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v10, 24 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v9, v9, a6, v0 +; LMULMAX1-RV64D-NEXT: vse32.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v8i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 
8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV32-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v8i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v8 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1054 +; LMULMAX8-RV64-NEXT: vrsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %a, i1 false) @@ -1534,6 +2297,127 @@ ; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a7) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: ctlz_v4i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsrl.vx v10, v8, a1 +; 
LMULMAX8-RV32-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: ctlz_v4i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; 
LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 2 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 8 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 16 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsrl.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vor.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; 
LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -1,101 +1,224 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64 -; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s 
--check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64I +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32,LMULMAX1-RV32I +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64,LMULMAX1-RV32I +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32,LMULMAX2-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64,LMULMAX2-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32,LMULMAX1-RV32D +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64,LMULMAX1-RV64D +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v,+d -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX8-RV64 define void @cttz_v16i8(<16 x i8>* %x, <16 x i8>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v16i8: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; 
LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v16i8: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV32I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v16i8: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: addi a1, 
zero, 85 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v16i8: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV64I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v16i8: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX1-RV32-NEXT: addi a1, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: addi a1, zero, 85 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: addi a1, zero, 51 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; 
LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX1-RV32I-LABEL: cttz_v16i8: +; LMULMAX1-RV32I: # %bb.0: +; LMULMAX1-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX1-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX1-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX1-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX1-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX1-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX1-RV32I-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV32I-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v16i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 85 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: addi a1, zero, 51 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv 
v8, v8, v9 -; LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v16i8: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v10, v9 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vi v8, v9, 15 +; LMULMAX2-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV32D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v16i8: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v10, v9 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; 
LMULMAX2-RV64D-NEXT: vand.vi v8, v9, 15 +; LMULMAX2-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v16i8: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v8, v9, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV32D-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v16i8: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX1-RV64D-NEXT: addi a1, zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; 
LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v8, v9, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v16i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v12, v9 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vsub.vx v8, v9, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v16i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v12, v9 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v12, v12 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vsub.vx v8, v9, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x %b = load <16 x i8>, <16 x i8>* %y %c = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 
false) @@ -105,125 +228,195 @@ declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>, i1) define void @cttz_v8i16(<8 x i16>* %x, <8 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v8i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v8i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 
+; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v8i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v8i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; 
LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 1 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v8i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX1-RV32-NEXT: addi a1, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 5 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 3 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 1 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: addi a1, zero, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV32-NEXT: vse16.v v8, (a0) -; 
LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v8i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV32D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 127 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v8i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 5 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 3 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 1 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: cttz_v8i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; 
LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV64D-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 127 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v8i16: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX1-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: lui a1, 5 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 3 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 1 +; LMULMAX1-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 8 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v8i16: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX1-RV64D-NEXT: addi a1, 
zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: lui a1, 5 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 3 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 1 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 8 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v8i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV32-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v8i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, 
v9 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV64-NEXT: vnsrl.wi v9, v10, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 false) @@ -233,129 +426,203 @@ declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>, i1) define void @cttz_v4i32(<4 x i32>* %x, <4 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v4i32: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v4i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32I-NEXT: 
vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v4i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; 
LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v4i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v9, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v9, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v9, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v9 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV32-NEXT: addi a1, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 -; 
LMULMAX1-RV32-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 209715 -; LMULMAX1-RV32-NEXT: addi a1, a1, 819 -; LMULMAX1-RV32-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV32-NEXT: lui a1, 61681 -; LMULMAX1-RV32-NEXT: addi a1, a1, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: lui a1, 4112 -; LMULMAX1-RV32-NEXT: addi a1, a1, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v4i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV32D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV32D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1023 +; LMULMAX2-RV32D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 1 -; LMULMAX1-RV64-NEXT: lui a1, 349525 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 
209715 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX1-RV64-NEXT: vand.vx v9, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v9, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v9 -; LMULMAX1-RV64-NEXT: lui a1, 61681 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: lui a1, 4112 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: cttz_v4i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vv v9, v8, v9 +; LMULMAX2-RV64D-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 52 +; LMULMAX2-RV64D-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1023 +; LMULMAX2-RV64D-NEXT: vsub.vx v9, v9, a1 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v4i32: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX1-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV32D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32D-NEXT: lui a1, 349525 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV32D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 209715 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v9, a1 +; 
LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32D-NEXT: lui a1, 61681 +; LMULMAX1-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: lui a1, 4112 +; LMULMAX1-RV32D-NEXT: addi a1, a1, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v9, 24 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v4i32: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX1-RV64D-NEXT: addi a1, zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v9, v8, a1 +; LMULMAX1-RV64D-NEXT: vxor.vi v10, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV64D-NEXT: lui a1, 349525 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX1-RV64D-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 209715 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV64D-NEXT: lui a1, 61681 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: lui a1, 4112 +; LMULMAX1-RV64D-NEXT: addiw a1, a1, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v9, v9, a1 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v9, 24 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX1-RV64D-NEXT: 
vmerge.vxm v8, v9, a1, v0 +; LMULMAX1-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v4i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v4i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v9, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v9, v8, v9 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v10, v9 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v9, v10, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v9, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 false) @@ -566,6 +833,107 @@ ; LMULMAX1-RV64-NEXT: vsrl.vx v8, v8, a1 ; LMULMAX1-RV64-NEXT: vse64.v v8, (a0) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v2i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV32-NEXT: addi a1, zero, 1 +; LMULMAX8-RV32-NEXT: vsub.vx v9, v8, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; 
LMULMAX8-RV32-NEXT: vmv.v.i v10, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v9, v9, v10 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v9 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v9, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v9 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v2i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: addi a1, zero, 1 +; LMULMAX8-RV64-NEXT: vsub.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; 
LMULMAX8-RV64-NEXT: vand.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v9, v9, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v9, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v9, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v9, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v9 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y %c = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 false) @@ -575,131 +943,272 @@ declare <2 
x i64> @llvm.cttz.v2i64(<2 x i64>, i1) define void @cttz_v32i8(<32 x i8>* %x, <32 x i8>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v32i8: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV32-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v32i8: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32I-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV32I-NEXT: vse8.v v8, (a0) +; 
LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v32i8: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle8.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX2-RV64-NEXT: vse8.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v32i8: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64I-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vand.vi v8, v8, 15 +; LMULMAX2-RV64I-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v32i8: -; LMULMAX1-RV32: # %bb.0: -; 
LMULMAX1-RV32-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle8.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle8.v v9, (a0) -; LMULMAX1-RV32-NEXT: addi a2, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a2 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: addi a3, zero, 85 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: addi a4, zero, 51 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v9, a2 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vi v9, v9, 15 -; LMULMAX1-RV32-NEXT: vse8.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse8.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX1-RV32I-LABEL: cttz_v32i8: +; LMULMAX1-RV32I: # %bb.0: +; LMULMAX1-RV32I-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32I-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32I-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV32I-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV32I-NEXT: addi a2, zero, 1 +; LMULMAX1-RV32I-NEXT: vsub.vx v10, v8, a2 +; LMULMAX1-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX1-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX1-RV32I-NEXT: addi a3, 
zero, 85 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: addi a4, zero, 51 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v8, a4 +; LMULMAX1-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v8, v8, a4 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX1-RV32I-NEXT: vand.vi v8, v8, 15 +; LMULMAX1-RV32I-NEXT: vsub.vx v10, v9, a2 +; LMULMAX1-RV32I-NEXT: vxor.vi v9, v9, -1 +; LMULMAX1-RV32I-NEXT: vand.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 1 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v10, a3 +; LMULMAX1-RV32I-NEXT: vsub.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vand.vx v10, v9, a4 +; LMULMAX1-RV32I-NEXT: vsrl.vi v9, v9, 2 +; LMULMAX1-RV32I-NEXT: vand.vx v9, v9, a4 +; LMULMAX1-RV32I-NEXT: vadd.vv v9, v10, v9 +; LMULMAX1-RV32I-NEXT: vsrl.vi v10, v9, 4 +; LMULMAX1-RV32I-NEXT: vadd.vv v9, v9, v10 +; LMULMAX1-RV32I-NEXT: vand.vi v9, v9, 15 +; LMULMAX1-RV32I-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV32I-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV32I-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v32i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle8.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle8.v v9, (a0) -; LMULMAX1-RV64-NEXT: addi a2, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a2 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: addi a3, zero, 85 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: addi a4, zero, 51 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; 
LMULMAX1-RV64-NEXT: vand.vi v8, v8, 15 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a2 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vi v9, v9, 15 -; LMULMAX1-RV64-NEXT: vse8.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse8.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v32i8: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV32D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: vand.vi v8, v10, 15 +; LMULMAX2-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV32D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret +; +; LMULMAX2-RV64D-LABEL: cttz_v32i8: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: 
addi a1, zero, 1 +; LMULMAX2-RV64D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV64D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: vand.vi v8, v10, 15 +; LMULMAX2-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX2-RV64D-NEXT: vse8.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v32i8: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV32D-NEXT: addi a2, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v8, a2 +; LMULMAX1-RV32D-NEXT: vxor.vi v11, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: addi a3, zero, 85 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: addi a4, zero, 51 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v8, v10, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v9, a2 +; LMULMAX1-RV32D-NEXT: vxor.vi v11, v9, -1 +; 
LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vand.vi v9, v10, 15 +; LMULMAX1-RV32D-NEXT: vmerge.vim v9, v9, 8, v0 +; LMULMAX1-RV32D-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v32i8: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle8.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle8.v v9, (a0) +; LMULMAX1-RV64D-NEXT: addi a2, zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v8, a2 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: addi a3, zero, 85 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: addi a4, zero, 51 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v8, v10, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v9, a2 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v9, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, 
v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vand.vi v9, v10, 15 +; LMULMAX1-RV64D-NEXT: vmerge.vim v9, v9, 8, v0 +; LMULMAX1-RV64D-NEXT: vse8.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse8.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v32i8: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV32-NEXT: vzext.vf4 v16, v10 +; LMULMAX8-RV32-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV32-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: vsub.vx v8, v10, a1 +; LMULMAX8-RV32-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV32-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v32i8: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle8.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e32, m8, ta, mu +; LMULMAX8-RV64-NEXT: vzext.vf4 v16, v10 +; LMULMAX8-RV64-NEXT: vfcvt.f.xu.v v16, v16 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e16, m4, ta, mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v12, v16, 23 +; LMULMAX8-RV64-NEXT: vsetvli zero, zero, e8, m2, ta, 
mu +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: vsub.vx v8, v10, a1 +; LMULMAX8-RV64-NEXT: vmerge.vim v8, v8, 8, v0 +; LMULMAX8-RV64-NEXT: vse8.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = load <32 x i8>, <32 x i8>* %y %c = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 false) @@ -709,161 +1218,269 @@ declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>, i1) define void @cttz_v16i16(<16 x i16>* %x, <16 x i16>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v16i16: -; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 5 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 3 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 1 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV32-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v16i16: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; 
LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 5 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 3 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 1 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV32I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v16i16: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle16.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 5 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 3 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 1 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 257 -; LMULMAX2-RV64-NEXT: vmul.vx 
v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX2-RV64-NEXT: vse16.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v16i16: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV64I-NEXT: lui a1, 5 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 3 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 1 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 8 +; LMULMAX2-RV64I-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v16i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle16.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle16.v v9, (a0) -; LMULMAX1-RV32-NEXT: addi a6, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a6 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a3, 5 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 3 -; LMULMAX1-RV32-NEXT: addi a4, 
a4, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a5, 1 -; LMULMAX1-RV32-NEXT: addi a5, a5, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: addi a2, zero, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v9, a6 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 8 -; LMULMAX1-RV32-NEXT: vse16.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse16.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v16i16: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV32D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: lui a1, 5 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 3 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, 
v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 1 +; LMULMAX2-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 257 +; LMULMAX2-RV32D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v16i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle16.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle16.v v9, (a0) -; LMULMAX1-RV64-NEXT: addi a6, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a6 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a3, 5 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 3 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a5, 1 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: addi a2, zero, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 8 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a6 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 
-; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 8 -; LMULMAX1-RV64-NEXT: vse16.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse16.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: cttz_v16i16: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV64D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: lui a1, 5 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 3 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 1 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 257 +; LMULMAX2-RV64D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 16 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse16.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v16i16: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 8, 
e16, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle16.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle16.v v9, (a0) +; LMULMAX1-RV32D-NEXT: addi a6, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v8, a6 +; LMULMAX1-RV32D-NEXT: vxor.vi v11, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: lui a3, 5 +; LMULMAX1-RV32D-NEXT: addi a3, a3, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a4, 3 +; LMULMAX1-RV32D-NEXT: addi a4, a4, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a5, 1 +; LMULMAX1-RV32D-NEXT: addi a5, a5, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: addi a2, zero, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a7, zero, 16 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v10, a7, v0 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v9, a6 +; LMULMAX1-RV32D-NEXT: vxor.vi v11, v9, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v10, 8 +; LMULMAX1-RV32D-NEXT: 
vmerge.vxm v9, v9, a7, v0 +; LMULMAX1-RV32D-NEXT: vse16.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse16.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v16i16: +; LMULMAX1-RV64D: # %bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle16.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle16.v v9, (a0) +; LMULMAX1-RV64D-NEXT: addi a6, zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v8, a6 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: lui a3, 5 +; LMULMAX1-RV64D-NEXT: addiw a3, a3, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a4, 3 +; LMULMAX1-RV64D-NEXT: addiw a4, a4, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a5, 1 +; LMULMAX1-RV64D-NEXT: addiw a5, a5, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: addi a2, zero, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 8 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a7, zero, 16 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v10, a7, v0 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v9, a6 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v9, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 
4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v10, 8 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v9, v9, a7, v0 +; LMULMAX1-RV64D-NEXT: vse16.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse16.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v16i16: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV32-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV32-NEXT: addi a1, zero, 127 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 16 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v16i16: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 16, e16, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle16.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV64-NEXT: vnsrl.wi v10, v12, 23 +; LMULMAX8-RV64-NEXT: addi a1, zero, 127 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 16 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse16.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 false) @@ -873,165 +1490,277 @@ declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>, i1) define void @cttz_v8i32(<8 x i32>* %x, <8 x i32>* %y) nounwind { -; LMULMAX2-RV32-LABEL: cttz_v8i32: -; LMULMAX2-RV32: # %bb.0: -; 
LMULMAX2-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV32-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV32-NEXT: addi a1, zero, 1 -; LMULMAX2-RV32-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV32-NEXT: lui a1, 349525 -; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 -; LMULMAX2-RV32-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 209715 -; LMULMAX2-RV32-NEXT: addi a1, a1, 819 -; LMULMAX2-RV32-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV32-NEXT: lui a1, 61681 -; LMULMAX2-RV32-NEXT: addi a1, a1, -241 -; LMULMAX2-RV32-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: lui a1, 4112 -; LMULMAX2-RV32-NEXT: addi a1, a1, 257 -; LMULMAX2-RV32-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV32-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV32-NEXT: ret +; LMULMAX2-RV32I-LABEL: cttz_v8i32: +; LMULMAX2-RV32I: # %bb.0: +; LMULMAX2-RV32I-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV32I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV32I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX2-RV32I-NEXT: lui a1, 349525 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 209715 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV32I-NEXT: vsrl.vi v10, v8, 4 +; 
LMULMAX2-RV32I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV32I-NEXT: lui a1, 61681 +; LMULMAX2-RV32I-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: lui a1, 4112 +; LMULMAX2-RV32I-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV32I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV32I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32I-NEXT: ret ; -; LMULMAX2-RV64-LABEL: cttz_v8i32: -; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: addi a1, zero, 1 -; LMULMAX2-RV64-NEXT: vsub.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX2-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX2-RV64-NEXT: lui a1, 349525 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX2-RV64-NEXT: vand.vx v10, v10, a1 -; LMULMAX2-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 209715 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 -; LMULMAX2-RV64-NEXT: vand.vx v10, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX2-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX2-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX2-RV64-NEXT: lui a1, 61681 -; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 -; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: lui a1, 4112 -; LMULMAX2-RV64-NEXT: addiw a1, a1, 257 -; LMULMAX2-RV64-NEXT: vmul.vx v8, v8, a1 -; LMULMAX2-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX2-RV64-NEXT: vse32.v v8, (a0) -; LMULMAX2-RV64-NEXT: ret +; LMULMAX2-RV64I-LABEL: cttz_v8i32: +; LMULMAX2-RV64I: # %bb.0: +; LMULMAX2-RV64I-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV64I-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64I-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vxor.vi v8, v8, -1 +; LMULMAX2-RV64I-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 1 +; 
LMULMAX2-RV64I-NEXT: lui a1, 349525 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64I-NEXT: vsub.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 209715 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64I-NEXT: vand.vx v10, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v10, v8 +; LMULMAX2-RV64I-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX2-RV64I-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-RV64I-NEXT: lui a1, 61681 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64I-NEXT: vand.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: lui a1, 4112 +; LMULMAX2-RV64I-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64I-NEXT: vmul.vx v8, v8, a1 +; LMULMAX2-RV64I-NEXT: vsrl.vi v8, v8, 24 +; LMULMAX2-RV64I-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64I-NEXT: ret ; -; LMULMAX1-RV32-LABEL: cttz_v8i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV32-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV32-NEXT: addi a6, zero, 1 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v8, a6 -; LMULMAX1-RV32-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV32-NEXT: lui a3, 349525 -; LMULMAX1-RV32-NEXT: addi a3, a3, 1365 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a4, 209715 -; LMULMAX1-RV32-NEXT: addi a4, a4, 819 -; LMULMAX1-RV32-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV32-NEXT: lui a5, 61681 -; LMULMAX1-RV32-NEXT: addi a5, a5, -241 -; LMULMAX1-RV32-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV32-NEXT: lui a2, 4112 -; LMULMAX1-RV32-NEXT: addi 
a2, a2, 257 -; LMULMAX1-RV32-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV32-NEXT: vsub.vx v10, v9, a6 -; LMULMAX1-RV32-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV32-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV32-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV32-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV32-NEXT: vadd.vv v9, v9, v10 -; LMULMAX1-RV32-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV32-NEXT: vmul.vx v9, v9, a2 -; LMULMAX1-RV32-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV32-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV32-NEXT: ret +; LMULMAX2-RV32D-LABEL: cttz_v8i32: +; LMULMAX2-RV32D: # %bb.0: +; LMULMAX2-RV32D-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV32D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV32D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV32D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV32D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV32D-NEXT: lui a1, 349525 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV32D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 209715 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV32D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV32D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV32D-NEXT: lui a1, 61681 +; LMULMAX2-RV32D-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV32D-NEXT: lui a1, 4112 +; LMULMAX2-RV32D-NEXT: addi a1, a1, 257 +; LMULMAX2-RV32D-NEXT: vmul.vx v10, v10, a1 +; 
LMULMAX2-RV32D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX2-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV32D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV32D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV32D-NEXT: ret ; -; LMULMAX1-RV64-LABEL: cttz_v8i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle32.v v8, (a1) -; LMULMAX1-RV64-NEXT: vle32.v v9, (a0) -; LMULMAX1-RV64-NEXT: addi a6, zero, 1 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v8, a6 -; LMULMAX1-RV64-NEXT: vxor.vi v8, v8, -1 -; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 1 -; LMULMAX1-RV64-NEXT: lui a3, 349525 -; LMULMAX1-RV64-NEXT: addiw a3, a3, 1365 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a4, 209715 -; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 -; LMULMAX1-RV64-NEXT: vand.vx v10, v8, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 2 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v10, v8 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v8, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v8, v8, v10 -; LMULMAX1-RV64-NEXT: lui a5, 61681 -; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 -; LMULMAX1-RV64-NEXT: vand.vx v8, v8, a5 -; LMULMAX1-RV64-NEXT: lui a2, 4112 -; LMULMAX1-RV64-NEXT: addiw a2, a2, 257 -; LMULMAX1-RV64-NEXT: vmul.vx v8, v8, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v8, v8, 24 -; LMULMAX1-RV64-NEXT: vsub.vx v10, v9, a6 -; LMULMAX1-RV64-NEXT: vxor.vi v9, v9, -1 -; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 1 -; LMULMAX1-RV64-NEXT: vand.vx v10, v10, a3 -; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v10 -; LMULMAX1-RV64-NEXT: vand.vx v10, v9, a4 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 2 -; LMULMAX1-RV64-NEXT: vand.vx v9, v9, a4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v10, v9 -; LMULMAX1-RV64-NEXT: vsrl.vi v10, v9, 4 -; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v10 -; 
LMULMAX1-RV64-NEXT: vand.vx v9, v9, a5 -; LMULMAX1-RV64-NEXT: vmul.vx v9, v9, a2 -; LMULMAX1-RV64-NEXT: vsrl.vi v9, v9, 24 -; LMULMAX1-RV64-NEXT: vse32.v v9, (a0) -; LMULMAX1-RV64-NEXT: vse32.v v8, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX2-RV64D-LABEL: cttz_v8i32: +; LMULMAX2-RV64D: # %bb.0: +; LMULMAX2-RV64D-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX2-RV64D-NEXT: vle32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: addi a1, zero, 1 +; LMULMAX2-RV64D-NEXT: vsub.vx v10, v8, a1 +; LMULMAX2-RV64D-NEXT: vxor.vi v12, v8, -1 +; LMULMAX2-RV64D-NEXT: vand.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 1 +; LMULMAX2-RV64D-NEXT: lui a1, 349525 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v12, a1 +; LMULMAX2-RV64D-NEXT: vsub.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 209715 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64D-NEXT: vand.vx v12, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v12, v10 +; LMULMAX2-RV64D-NEXT: vsrl.vi v12, v10, 4 +; LMULMAX2-RV64D-NEXT: vadd.vv v10, v10, v12 +; LMULMAX2-RV64D-NEXT: lui a1, 61681 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64D-NEXT: vand.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: lui a1, 4112 +; LMULMAX2-RV64D-NEXT: addiw a1, a1, 257 +; LMULMAX2-RV64D-NEXT: vmul.vx v10, v10, a1 +; LMULMAX2-RV64D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX2-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-RV64D-NEXT: addi a1, zero, 32 +; LMULMAX2-RV64D-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX2-RV64D-NEXT: vse32.v v8, (a0) +; LMULMAX2-RV64D-NEXT: ret +; +; LMULMAX1-RV32D-LABEL: cttz_v8i32: +; LMULMAX1-RV32D: # %bb.0: +; LMULMAX1-RV32D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV32D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32D-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV32D-NEXT: vle32.v v9, (a0) +; LMULMAX1-RV32D-NEXT: addi a6, zero, 1 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v8, a6 +; LMULMAX1-RV32D-NEXT: 
vxor.vi v11, v8, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: lui a3, 349525 +; LMULMAX1-RV32D-NEXT: addi a3, a3, 1365 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a4, 209715 +; LMULMAX1-RV32D-NEXT: addi a4, a4, 819 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: lui a5, 61681 +; LMULMAX1-RV32D-NEXT: addi a5, a5, -241 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: lui a2, 4112 +; LMULMAX1-RV32D-NEXT: addi a2, a2, 257 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV32D-NEXT: addi a7, zero, 32 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v8, v10, a7, v0 +; LMULMAX1-RV32D-NEXT: vsub.vx v10, v9, a6 +; LMULMAX1-RV32D-NEXT: vxor.vi v11, v9, -1 +; LMULMAX1-RV32D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV32D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV32D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV32D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV32D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV32D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV32D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV32D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV32D-NEXT: vsrl.vi v9, v10, 24 +; LMULMAX1-RV32D-NEXT: vmerge.vxm v9, v9, a7, v0 +; LMULMAX1-RV32D-NEXT: vse32.v v9, (a0) +; LMULMAX1-RV32D-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV32D-NEXT: ret +; +; LMULMAX1-RV64D-LABEL: cttz_v8i32: +; LMULMAX1-RV64D: # 
%bb.0: +; LMULMAX1-RV64D-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; LMULMAX1-RV64D-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64D-NEXT: vle32.v v8, (a1) +; LMULMAX1-RV64D-NEXT: vle32.v v9, (a0) +; LMULMAX1-RV64D-NEXT: addi a6, zero, 1 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v8, a6 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v8, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: lui a3, 349525 +; LMULMAX1-RV64D-NEXT: addiw a3, a3, 1365 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a4, 209715 +; LMULMAX1-RV64D-NEXT: addiw a4, a4, 819 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: lui a5, 61681 +; LMULMAX1-RV64D-NEXT: addiw a5, a5, -241 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: lui a2, 4112 +; LMULMAX1-RV64D-NEXT: addiw a2, a2, 257 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a2 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 24 +; LMULMAX1-RV64D-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-RV64D-NEXT: addi a7, zero, 32 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v8, v10, a7, v0 +; LMULMAX1-RV64D-NEXT: vsub.vx v10, v9, a6 +; LMULMAX1-RV64D-NEXT: vxor.vi v11, v9, -1 +; LMULMAX1-RV64D-NEXT: vand.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 1 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v11, a3 +; LMULMAX1-RV64D-NEXT: vsub.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v11, v10, a4 +; LMULMAX1-RV64D-NEXT: vsrl.vi v10, v10, 2 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v11, v10 +; LMULMAX1-RV64D-NEXT: vsrl.vi v11, v10, 4 +; LMULMAX1-RV64D-NEXT: vadd.vv v10, v10, v11 +; LMULMAX1-RV64D-NEXT: vand.vx v10, v10, a5 +; LMULMAX1-RV64D-NEXT: vmul.vx v10, v10, a2 +; 
LMULMAX1-RV64D-NEXT: vmseq.vi v0, v9, 0 +; LMULMAX1-RV64D-NEXT: vsrl.vi v9, v10, 24 +; LMULMAX1-RV64D-NEXT: vmerge.vxm v9, v9, a7, v0 +; LMULMAX1-RV64D-NEXT: vse32.v v9, (a0) +; LMULMAX1-RV64D-NEXT: vse32.v v8, (a1) +; LMULMAX1-RV64D-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v8i32: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV32-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV32-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV32-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 52 +; LMULMAX8-RV32-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV32-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV32-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV32-NEXT: addi a1, zero, 32 +; LMULMAX8-RV32-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV32-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v8i32: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV64-NEXT: vle32.v v8, (a0) +; LMULMAX8-RV64-NEXT: vrsub.vi v10, v8, 0 +; LMULMAX8-RV64-NEXT: vand.vv v10, v8, v10 +; LMULMAX8-RV64-NEXT: vfwcvt.f.xu.v v12, v10 +; LMULMAX8-RV64-NEXT: addi a1, zero, 52 +; LMULMAX8-RV64-NEXT: vnsrl.wx v10, v12, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 1023 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX8-RV64-NEXT: addi a1, zero, 32 +; LMULMAX8-RV64-NEXT: vmerge.vxm v8, v10, a1, v0 +; LMULMAX8-RV64-NEXT: vse32.v v8, (a0) +; LMULMAX8-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 false) @@ -1278,6 +2007,107 @@ ; LMULMAX1-RV64-NEXT: vse64.v v9, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v8, (a7) ; LMULMAX1-RV64-NEXT: ret +; +; LMULMAX8-RV32-LABEL: cttz_v4i64: +; LMULMAX8-RV32: # %bb.0: +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vle64.v v8, 
(a0) +; LMULMAX8-RV32-NEXT: addi a1, zero, 1 +; LMULMAX8-RV32-NEXT: vsub.vx v10, v8, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.i v12, -1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vxor.vv v8, v8, v12 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV32-NEXT: lui a1, 349525 +; LMULMAX8-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v12, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX8-RV32-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 209715 +; LMULMAX8-RV32-NEXT: addi a1, a1, 819 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v12, v8, v10 +; LMULMAX8-RV32-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v12, v8 +; LMULMAX8-RV32-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV32-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 61681 +; LMULMAX8-RV32-NEXT: addi a1, a1, -241 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: lui a1, 4112 +; LMULMAX8-RV32-NEXT: addi a1, a1, 257 +; LMULMAX8-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmv.v.x v10, a1 +; LMULMAX8-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; LMULMAX8-RV32-NEXT: vmul.vv v8, v8, v10 +; LMULMAX8-RV32-NEXT: addi a1, zero, 56 +; LMULMAX8-RV32-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV32-NEXT: vse64.v v8, (a0) +; LMULMAX8-RV32-NEXT: ret +; +; LMULMAX8-RV64-LABEL: cttz_v4i64: +; LMULMAX8-RV64: # %bb.0: +; LMULMAX8-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; 
LMULMAX8-RV64-NEXT: vle64.v v8, (a0) +; LMULMAX8-RV64-NEXT: addi a1, zero, 1 +; LMULMAX8-RV64-NEXT: vsub.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vxor.vi v8, v8, -1 +; LMULMAX8-RV64-NEXT: vand.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 1 +; LMULMAX8-RV64-NEXT: lui a1, 21845 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX8-RV64-NEXT: vand.vx v10, v10, a1 +; LMULMAX8-RV64-NEXT: vsub.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 13107 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 819 +; LMULMAX8-RV64-NEXT: vand.vx v10, v8, a1 +; LMULMAX8-RV64-NEXT: vsrl.vi v8, v8, 2 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v10, v8 +; LMULMAX8-RV64-NEXT: vsrl.vi v10, v8, 4 +; LMULMAX8-RV64-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-RV64-NEXT: lui a1, 3855 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, 241 +; LMULMAX8-RV64-NEXT: slli a1, a1, 12 +; LMULMAX8-RV64-NEXT: addi a1, a1, -241 +; LMULMAX8-RV64-NEXT: vand.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: lui a1, 4112 +; LMULMAX8-RV64-NEXT: addiw a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: slli a1, a1, 16 +; LMULMAX8-RV64-NEXT: addi a1, a1, 257 +; LMULMAX8-RV64-NEXT: vmul.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: addi a1, zero, 56 +; LMULMAX8-RV64-NEXT: vsrl.vx v8, v8, a1 +; LMULMAX8-RV64-NEXT: vse64.v v8, (a0) +; 
LMULMAX8-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 false)