diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -651,6 +651,11 @@ return false; } + /// Return true if ctpop instruction is fast. + virtual bool isCtpopFast(EVT VT) const { + return isOperationLegal(ISD::CTPOP, VT); + } + /// Return the maximum number of "x & (x - 1)" operations that can be done /// instead of deferring to a custom CTPOP. virtual unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -4168,8 +4168,8 @@ // (ctpop x) u< 2 -> (x & x-1) == 0 // (ctpop x) u> 1 -> (x & x-1) != 0 if (Cond == ISD::SETULT || Cond == ISD::SETUGT) { - // Keep the CTPOP if it is a legal vector op. - if (CTVT.isVector() && TLI.isOperationLegal(ISD::CTPOP, CTVT)) + // Keep the CTPOP if it is a cheap vector op. + if (CTVT.isVector() && TLI.isCtpopFast(CTVT)) return SDValue(); unsigned CostLimit = TLI.getCustomCtpopCost(CTVT, Cond); @@ -4194,8 +4194,8 @@ // (ctpop x) == 1 --> (x != 0) && ((x & x-1) == 0) // (ctpop x) != 1 --> (x == 0) || ((x & x-1) != 0) if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && C1 == 1) { - // Keep the CTPOP if it is legal. - if (TLI.isOperationLegal(ISD::CTPOP, CTVT)) + // Keep the CTPOP if it is cheap. + if (TLI.isCtpopFast(CTVT)) return SDValue(); SDValue Zero = DAG.getConstant(0, dl, CTVT); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -607,6 +607,10 @@ } bool convertSelectOfConstantsToMath(EVT VT) const override { return true; } + bool isCtpopFast(EVT VT) const override; + + unsigned getCustomCtpopCost(EVT VT, ISD::CondCode Cond) const override; + bool preferZeroCompareBranch() const override { return true; } bool shouldInsertFencesForAtomic(const Instruction *I) const override { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -18585,6 +18585,20 @@ return getTargetMMOFlags(NodeX) == getTargetMMOFlags(NodeY); } +bool RISCVTargetLowering::isCtpopFast(EVT VT) const { + if (VT.isScalableVector()) + return isTypeLegal(VT) && Subtarget.hasStdExtZvbb(); + if (VT.isFixedLengthVector() && Subtarget.hasStdExtZvbb()) + return true; + return Subtarget.hasStdExtZbb() && + (VT == MVT::i32 || VT == MVT::i64 || VT.isFixedLengthVector()); +} + +unsigned RISCVTargetLowering::getCustomCtpopCost(EVT VT, + ISD::CondCode Cond) const { + return isCtpopFast(VT) ? 
0 : 1; +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -309,6 +309,259 @@ ret i32 %1 } +define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { +; RV32I-LABEL: ctpop_i32_ult_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i32_ult_two: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ult i32 %1, 2 + ret i1 %2 +} + +define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind { +; RV32I-LABEL: ctpop_i32_ugt_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i32_ugt_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ugt i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind { +; RV32I-LABEL: ctpop_i32_eq_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i32_eq_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: seqz a0, a0 +; RV32ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp eq i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind { +; RV32I-LABEL: ctpop_i32_ne_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a1, a0, -1 +; RV32I-NEXT: and a1, a0, a1 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i32_ne_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: snez a0, a0 +; RV32ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ne i32 %1, 1 + ret i1 %2 +} + +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { +; RV32I-LABEL: ctpop_v2i32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -32 +; RV32I-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi s3, a2, 1365 +; RV32I-NEXT: and a1, a1, s3 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi s4, a1, 819 +; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi s5, a1, -241 +; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi s1, a1, 257 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli s2, a0, 24 +; RV32I-NEXT: srli a0, s0, 1 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub s0, 
s0, a0 +; RV32I-NEXT: and a0, s0, s4 +; RV32I-NEXT: srli s0, s0, 2 +; RV32I-NEXT: and a1, s0, s4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s5 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli a1, a0, 24 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 32 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + ret <2 x i32> %1 +} + +define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { +; RV32I-LABEL: ctpop_v2i32_ult_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i32_ult_two: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ult <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { +; RV32I-LABEL: ctpop_v2i32_ugt_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i32_ugt_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: xori a1, a1, 1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ugt <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { +; RV32I-LABEL: ctpop_v2i32_eq_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: seqz a2, a2 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: seqz a2, a2 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: and a1, a1, a2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i32_eq_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: seqz a0, a0 +; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: seqz a1, a1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp eq <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { +; RV32I-LABEL: ctpop_v2i32_ne_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: snez a2, a2 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: addi a2, a1, -1 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: snez a2, a2 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i32_ne_one: +; RV32ZBB: # 
%bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: snez a0, a0 +; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ne <2 x i32> %1, + ret <2 x i1> %2 +} + declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { @@ -380,6 +633,422 @@ ret i64 %1 } +define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { +; RV32I-LABEL: ctpop_i64_ugt_two: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i64_ugt_two: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ult i64 %1, 2 + ret i1 %2 +} + +define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { +; RV32I-LABEL: ctpop_i64_ugt_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: and a0, a1, a0 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i64_ugt_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ugt i64 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i64_eq_one(i64 %a) nounwind { +; RV32I-LABEL: ctpop_i64_eq_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: seqz a3, a0 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: and a3, a1, a3 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: seqz a2, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i64_eq_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: seqz a0, a0 +; RV32ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp eq i64 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i64_ne_one(i64 %a) nounwind { +; RV32I-LABEL: ctpop_i64_ne_one: +; RV32I: # %bb.0: +; RV32I-NEXT: addi a2, a0, -1 +; RV32I-NEXT: and a2, a0, a2 +; RV32I-NEXT: seqz a3, a0 +; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: and a3, a1, a3 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: snez a2, a2 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_i64_ne_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: add a0, a0, a1 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: snez a0, a0 +; RV32ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ne i64 %1, 1 + ret i1 %2 +} + +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) + +define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { +; RV32I-LABEL: ctpop_v2i64: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -48 +; RV32I-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 40(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 36(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 32(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s3, 28(sp) # 
4-byte Folded Spill +; RV32I-NEXT: sw s4, 24(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s5, 20(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s6, 16(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s7, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s8, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: lw a0, 4(a1) +; RV32I-NEXT: lw s2, 8(a1) +; RV32I-NEXT: lw s5, 12(a1) +; RV32I-NEXT: lw s6, 0(a1) +; RV32I-NEXT: srli a1, a0, 1 +; RV32I-NEXT: lui a2, 349525 +; RV32I-NEXT: addi s3, a2, 1365 +; RV32I-NEXT: and a1, a1, s3 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: lui a1, 209715 +; RV32I-NEXT: addi s4, a1, 819 +; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: lui a1, 61681 +; RV32I-NEXT: addi s7, a1, -241 +; RV32I-NEXT: and a0, a0, s7 +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi s1, a1, 257 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli s8, a0, 24 +; RV32I-NEXT: srli a0, s6, 1 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub a0, s6, a0 +; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s7 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: add s8, a0, s8 +; RV32I-NEXT: srli a0, s5, 1 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub a0, s5, a0 +; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s7 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli s5, a0, 24 +; RV32I-NEXT: srli a0, s2, 1 +; RV32I-NEXT: and a0, a0, s3 +; RV32I-NEXT: sub a0, s2, a0 +; RV32I-NEXT: and a1, a0, s4 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: and a0, a0, s4 +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: srli a1, a0, 4 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: and a0, a0, s7 +; RV32I-NEXT: mv a1, s1 +; RV32I-NEXT: call __mulsi3@plt +; RV32I-NEXT: srli a0, a0, 24 +; RV32I-NEXT: add a0, a0, s5 +; RV32I-NEXT: sw zero, 12(s0) +; RV32I-NEXT: sw zero, 4(s0) +; RV32I-NEXT: sw a0, 8(s0) +; RV32I-NEXT: sw s8, 0(s0) +; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 36(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 32(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s3, 28(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s4, 24(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s5, 20(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s6, 16(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s7, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s8, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 48 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a2, 4(a1) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a1, 12(a1) +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: cpop a3, a3 +; RV32ZBB-NEXT: add a2, a3, a2 +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a3, a4 +; RV32ZBB-NEXT: add a1, a3, a1 +; RV32ZBB-NEXT: sw zero, 12(a0) +; RV32ZBB-NEXT: sw zero, 4(a0) +; RV32ZBB-NEXT: sw a1, 8(a0) +; RV32ZBB-NEXT: sw a2, 0(a0) +; RV32ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + ret <2 x i64> %1 +} + +define <2 x 
i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { +; RV32I-LABEL: ctpop_v2i64_ult_two: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: sub a1, a0, a1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: addi a1, a3, -1 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: seqz a3, a3 +; RV32I-NEXT: sub a3, a2, a3 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i64_ult_two: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a1, 12(a0) +; RV32ZBB-NEXT: lw a2, 8(a0) +; RV32ZBB-NEXT: lw a3, 0(a0) +; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: cpop a2, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ult <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { +; RV32I-LABEL: ctpop_v2i64_ugt_one: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: sub a1, a0, a1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: or a0, a4, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: addi a1, a3, -1 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: seqz a3, a3 +; RV32I-NEXT: sub a3, a2, a3 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i64_ugt_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a1, 12(a0) +; RV32ZBB-NEXT: lw a2, 8(a0) +; RV32ZBB-NEXT: lw a3, 0(a0) +; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: cpop a2, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: sltiu a0, a0, 2 +; RV32ZBB-NEXT: xori a0, a0, 1 +; RV32ZBB-NEXT: sltiu a1, a1, 2 +; RV32ZBB-NEXT: xori a1, a1, 1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ugt <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { +; RV32I-LABEL: ctpop_v2i64_eq_one: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: seqz a5, a1 +; RV32I-NEXT: sub a5, a0, a5 +; RV32I-NEXT: and a5, a0, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: seqz a4, a4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: snez a0, a0 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: addi a1, a3, -1 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: seqz a4, a3 +; RV32I-NEXT: sub a4, a2, a4 +; RV32I-NEXT: and a4, a2, a4 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: seqz a1, a1 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: snez a2, a2 +; RV32I-NEXT: and a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i64_eq_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a1, 12(a0) +; RV32ZBB-NEXT: lw a2, 8(a0) +; RV32ZBB-NEXT: lw a3, 0(a0) +; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a2, a2 
+; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: cpop a2, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: seqz a0, a0 +; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: seqz a1, a1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp eq <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { +; RV32I-LABEL: ctpop_v2i64_ne_one: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 12(a0) +; RV32I-NEXT: lw a3, 8(a0) +; RV32I-NEXT: lw a0, 4(a0) +; RV32I-NEXT: addi a4, a1, -1 +; RV32I-NEXT: and a4, a1, a4 +; RV32I-NEXT: seqz a5, a1 +; RV32I-NEXT: sub a5, a0, a5 +; RV32I-NEXT: and a5, a0, a5 +; RV32I-NEXT: or a4, a4, a5 +; RV32I-NEXT: snez a4, a4 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: addi a1, a3, -1 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: seqz a4, a3 +; RV32I-NEXT: sub a4, a2, a4 +; RV32I-NEXT: and a4, a2, a4 +; RV32I-NEXT: or a1, a1, a4 +; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: or a2, a3, a2 +; RV32I-NEXT: seqz a2, a2 +; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: ctpop_v2i64_ne_one: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a1, 12(a0) +; RV32ZBB-NEXT: lw a2, 8(a0) +; RV32ZBB-NEXT: lw a3, 0(a0) +; RV32ZBB-NEXT: lw a0, 4(a0) +; RV32ZBB-NEXT: cpop a1, a1 +; RV32ZBB-NEXT: cpop a2, a2 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: cpop a0, a0 +; RV32ZBB-NEXT: cpop a2, a3 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: addi a0, a0, -1 +; RV32ZBB-NEXT: snez a0, a0 +; RV32ZBB-NEXT: addi a1, a1, -1 +; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ne <2 x i64> %1, + ret <2 x i1> %2 +} + define i32 @sextb_i32(i32 %a) nounwind { ; RV32I-LABEL: sextb_i32: ; RV32I: # %bb.0: @@ -451,10 +1120,10 @@ define i32 @min_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: min_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: blt a0, a1, .LBB10_2 +; RV32I-NEXT: blt a0, a1, .LBB28_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: .LBB10_2: +; RV32I-NEXT: .LBB28_2: ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: min_i32: @@ -474,18 +1143,18 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: min_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: beq a1, a3, .LBB11_2 +; CHECK-NEXT: beq a1, a3, .LBB29_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: slt a4, a1, a3 -; CHECK-NEXT: beqz a4, .LBB11_3 -; CHECK-NEXT: j .LBB11_4 -; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: beqz a4, .LBB29_3 +; CHECK-NEXT: j .LBB29_4 +; CHECK-NEXT: .LBB29_2: ; CHECK-NEXT: sltu a4, a0, a2 -; CHECK-NEXT: bnez a4, .LBB11_4 -; CHECK-NEXT: .LBB11_3: +; CHECK-NEXT: bnez a4, .LBB29_4 +; CHECK-NEXT: .LBB29_3: ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: .LBB11_4: +; CHECK-NEXT: .LBB29_4: ; CHECK-NEXT: ret %cmp = icmp slt i64 %a, %b %cond = select i1 %cmp, i64 %a, i64 %b @@ -495,10 +1164,10 @@ define i32 @max_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: max_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: blt a1, a0, .LBB12_2 +; RV32I-NEXT: blt a1, a0, .LBB30_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: .LBB12_2: +; RV32I-NEXT: .LBB30_2: ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: max_i32: @@ -518,18 +1187,18 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: max_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: beq a1, a3, .LBB13_2 +; CHECK-NEXT: beq a1, a3, .LBB31_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: slt a4, a3, a1 -; CHECK-NEXT: beqz a4, .LBB13_3 
-; CHECK-NEXT: j .LBB13_4 -; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: beqz a4, .LBB31_3 +; CHECK-NEXT: j .LBB31_4 +; CHECK-NEXT: .LBB31_2: ; CHECK-NEXT: sltu a4, a2, a0 -; CHECK-NEXT: bnez a4, .LBB13_4 -; CHECK-NEXT: .LBB13_3: +; CHECK-NEXT: bnez a4, .LBB31_4 +; CHECK-NEXT: .LBB31_3: ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: .LBB13_4: +; CHECK-NEXT: .LBB31_4: ; CHECK-NEXT: ret %cmp = icmp sgt i64 %a, %b %cond = select i1 %cmp, i64 %a, i64 %b @@ -539,10 +1208,10 @@ define i32 @minu_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: minu_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: bltu a0, a1, .LBB14_2 +; RV32I-NEXT: bltu a0, a1, .LBB32_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: .LBB14_2: +; RV32I-NEXT: .LBB32_2: ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: minu_i32: @@ -562,18 +1231,18 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: minu_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: beq a1, a3, .LBB15_2 +; CHECK-NEXT: beq a1, a3, .LBB33_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: sltu a4, a1, a3 -; CHECK-NEXT: beqz a4, .LBB15_3 -; CHECK-NEXT: j .LBB15_4 -; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: beqz a4, .LBB33_3 +; CHECK-NEXT: j .LBB33_4 +; CHECK-NEXT: .LBB33_2: ; CHECK-NEXT: sltu a4, a0, a2 -; CHECK-NEXT: bnez a4, .LBB15_4 -; CHECK-NEXT: .LBB15_3: +; CHECK-NEXT: bnez a4, .LBB33_4 +; CHECK-NEXT: .LBB33_3: ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: .LBB15_4: +; CHECK-NEXT: .LBB33_4: ; CHECK-NEXT: ret %cmp = icmp ult i64 %a, %b %cond = select i1 %cmp, i64 %a, i64 %b @@ -583,10 +1252,10 @@ define i32 @maxu_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: maxu_i32: ; RV32I: # %bb.0: -; RV32I-NEXT: bltu a1, a0, .LBB16_2 +; RV32I-NEXT: bltu a1, a0, .LBB34_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: .LBB16_2: +; RV32I-NEXT: .LBB34_2: ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: maxu_i32: @@ -606,18 +1275,18 @@ define i64 @maxu_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: maxu_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: beq a1, a3, .LBB17_2 +; CHECK-NEXT: beq a1, a3, .LBB35_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: sltu a4, a3, a1 -; CHECK-NEXT: beqz a4, .LBB17_3 -; CHECK-NEXT: j .LBB17_4 -; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: beqz a4, .LBB35_3 +; CHECK-NEXT: j .LBB35_4 +; CHECK-NEXT: .LBB35_2: ; CHECK-NEXT: sltu a4, a2, a0 -; CHECK-NEXT: bnez a4, .LBB17_4 -; CHECK-NEXT: .LBB17_3: +; CHECK-NEXT: bnez a4, .LBB35_4 +; CHECK-NEXT: .LBB35_3: ; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: .LBB17_4: +; CHECK-NEXT: .LBB35_4: ; CHECK-NEXT: ret %cmp = icmp ugt i64 %a, %b %cond = select i1 %cmp, i64 %a, i64 %b @@ -648,13 +1317,13 @@ define i64 @abs_i64(i64 %x) { ; CHECK-LABEL: abs_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: bgez a1, .LBB19_2 +; CHECK-NEXT: bgez a1, .LBB37_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: snez a2, a0 ; CHECK-NEXT: neg a0, a0 ; CHECK-NEXT: neg a1, a1 ; CHECK-NEXT: sub a1, a1, a2 -; CHECK-NEXT: .LBB19_2: +; CHECK-NEXT: .LBB37_2: ; CHECK-NEXT: ret %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true) ret i64 %abs diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -578,6 +578,85 @@ ret i32 %1 } +define i1 @ctpop_i32_ult_two(i32 signext %a) nounwind { +; RV64I-LABEL: ctpop_i32_ult_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i32_ult_two: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu 
a0, a0, 2 +; RV64ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ult i32 %1, 2 + ret i1 %2 +} + +define i1 @ctpop_i32_ugt_one(i32 signext %a) nounwind { +; RV64I-LABEL: ctpop_i32_ugt_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i32_ugt_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ugt i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_eq_one(i32 signext %a) nounwind { +; RV64I-LABEL: ctpop_i32_eq_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i32_eq_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp eq i32 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i32_ne_one(i32 signext %a) nounwind { +; RV64I-LABEL: ctpop_i32_ne_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a1, a0, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i32_ne_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: snez a0, a0 +; RV64ZBB-NEXT: ret + %1 = call i32 @llvm.ctpop.i32(i32 %a) + %2 = icmp ne i32 %1, 1 + ret i1 %2 +} + define signext i32 @ctpop_i32_load(ptr %p) nounwind { ; RV64I-LABEL: ctpop_i32_load: ; RV64I: # %bb.0: @@ -618,6 +697,192 @@ ret i32 %1 } +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +define <2 x i32> @ctpop_v2i32(<2 x i32> %a) nounwind { +; RV64I-LABEL: ctpop_v2i32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw s3, a2, 1365 +; RV64I-NEXT: and a1, a1, s3 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: addiw s4, a1, 819 +; RV64I-NEXT: and a1, a0, s4 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, s4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw s5, a1, -241 +; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: addiw s1, a1, 257 +; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: call __muldi3@plt +; RV64I-NEXT: srliw s2, a0, 24 +; RV64I-NEXT: srli a0, s0, 1 +; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: sub s0, s0, a0 +; RV64I-NEXT: and a0, s0, s4 +; RV64I-NEXT: srli s0, s0, 2 +; RV64I-NEXT: and a1, s0, s4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: call __muldi3@plt +; RV64I-NEXT: srliw a1, a0, 24 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 40(sp) # 8-byte 
Folded Reload +; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: cpopw a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + ret <2 x i32> %1 +} + +define <2 x i1> @ctpop_v2i32_ult_two(<2 x i32> %a) nounwind { +; RV64I-LABEL: ctpop_v2i32_ult_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a2, a0, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addiw a2, a1, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i32_ult_two: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a1, a1 +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ult <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_ugt_one(<2 x i32> %a) nounwind { +; RV64I-LABEL: ctpop_v2i32_ugt_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addiw a2, a0, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addiw a2, a1, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i32_ugt_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a1, a1 +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: xori a1, a1, 1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ugt <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_eq_one(<2 x i32> %a) nounwind { +; RV64I-LABEL: ctpop_v2i32_eq_one: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: addiw a4, a0, -1 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: snez a3, a3 +; RV64I-NEXT: and a0, a3, a0 +; RV64I-NEXT: addiw a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: snez a2, a2 +; RV64I-NEXT: and a1, a2, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i32_eq_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a1, a1 +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: seqz a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp eq <2 x i32> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i32_ne_one(<2 x i32> %a) nounwind { +; RV64I-LABEL: ctpop_v2i32_ne_one: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a1 +; RV64I-NEXT: sext.w a3, a0 +; RV64I-NEXT: addiw a4, a0, -1 +; RV64I-NEXT: and a0, a0, a4 +; RV64I-NEXT: sext.w a0, a0 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: seqz a3, a3 +; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: addiw a3, a1, -1 +; RV64I-NEXT: and a1, a1, a3 +; RV64I-NEXT: sext.w a1, a1 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: seqz a2, a2 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i32_ne_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpopw a1, a1 +; RV64ZBB-NEXT: cpopw a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; 
RV64ZBB-NEXT: snez a0, a0 +; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: snez a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %a) + %2 = icmp ne <2 x i32> %1, + ret <2 x i1> %2 +} + declare i64 @llvm.ctpop.i64(i64) define i64 @ctpop_i64(i64 %a) nounwind { @@ -665,6 +930,267 @@ ret i64 %1 } +define i1 @ctpop_i64_ugt_two(i64 %a) nounwind { +; RV64I-LABEL: ctpop_i64_ugt_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i64_ugt_two: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ult i64 %1, 2 + ret i1 %2 +} + +define i1 @ctpop_i64_ugt_one(i64 %a) nounwind { +; RV64I-LABEL: ctpop_i64_ugt_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i64_ugt_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ugt i64 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i64_eq_one(i64 %a) nounwind { +; RV64I-LABEL: ctpop_i64_eq_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: and a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i64_eq_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp eq i64 %1, 1 + ret i1 %2 +} + +define i1 @ctpop_i64_ne_one(i64 %a) nounwind { +; RV64I-LABEL: ctpop_i64_ne_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a1, a0, -1 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_i64_ne_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: snez a0, a0 +; RV64ZBB-NEXT: ret + %1 = call i64 @llvm.ctpop.i64(i64 %a) + %2 = icmp ne i64 %1, 1 + ret i1 %2 +} + +declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) + +define <2 x i64> @ctpop_v2i64(<2 x i64> %a) nounwind { +; RV64I-LABEL: ctpop_v2i64: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -64 +; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 32(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: srli a1, a0, 1 +; RV64I-NEXT: lui a2, 349525 +; RV64I-NEXT: addiw a2, a2, 1365 +; RV64I-NEXT: slli a3, a2, 32 +; RV64I-NEXT: add s3, a2, a3 +; RV64I-NEXT: and a1, a1, s3 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: lui a1, 209715 +; RV64I-NEXT: addiw a1, a1, 819 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add s4, a1, a2 +; RV64I-NEXT: and a1, a0, s4 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: and a0, a0, s4 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: lui a1, 61681 +; RV64I-NEXT: addiw a1, a1, -241 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: add s5, a1, a2 +; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: lui a1, 4112 +; RV64I-NEXT: 
addiw s1, a1, 257 +; RV64I-NEXT: slli a1, s1, 32 +; RV64I-NEXT: add s1, s1, a1 +; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: call __muldi3@plt +; RV64I-NEXT: srli s2, a0, 56 +; RV64I-NEXT: srli a0, s0, 1 +; RV64I-NEXT: and a0, a0, s3 +; RV64I-NEXT: sub s0, s0, a0 +; RV64I-NEXT: and a0, s0, s4 +; RV64I-NEXT: srli s0, s0, 2 +; RV64I-NEXT: and a1, s0, s4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: srli a1, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: and a0, a0, s5 +; RV64I-NEXT: mv a1, s1 +; RV64I-NEXT: call __muldi3@plt +; RV64I-NEXT: srli a1, a0, 56 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 32(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 64 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: cpop a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + ret <2 x i64> %1 +} + +define <2 x i1> @ctpop_v2i64_ult_two(<2 x i64> %a) nounwind { +; RV64I-LABEL: ctpop_v2i64_ult_two: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: addi a2, a1, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i64_ult_two: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a1, a1 +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ult <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_ugt_one(<2 x i64> %a) nounwind { +; RV64I-LABEL: ctpop_v2i64_ugt_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: addi a2, a1, -1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i64_ugt_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a1, a1 +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: sltiu a0, a0, 2 +; RV64ZBB-NEXT: xori a0, a0, 1 +; RV64ZBB-NEXT: sltiu a1, a1, 2 +; RV64ZBB-NEXT: xori a1, a1, 1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ugt <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_eq_one(<2 x i64> %a) nounwind { +; RV64I-LABEL: ctpop_v2i64_eq_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: seqz a2, a2 +; RV64I-NEXT: snez a0, a0 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: addi a2, a1, -1 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: seqz a2, a2 +; RV64I-NEXT: snez a1, a1 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i64_eq_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a1, a1 +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: seqz a0, a0 +; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: seqz a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp eq <2 x i64> %1, + ret <2 x i1> %2 +} + +define <2 x i1> @ctpop_v2i64_ne_one(<2 x i64> %a) nounwind { +; RV64I-LABEL: ctpop_v2i64_ne_one: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, a0, -1 +; RV64I-NEXT: and a2, a0, a2 +; RV64I-NEXT: snez a2, a2 +; RV64I-NEXT: seqz a0, 
a0 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: addi a2, a1, -1 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: snez a2, a2 +; RV64I-NEXT: seqz a1, a1 +; RV64I-NEXT: or a1, a1, a2 +; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: ctpop_v2i64_ne_one: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: cpop a1, a1 +; RV64ZBB-NEXT: cpop a0, a0 +; RV64ZBB-NEXT: addi a0, a0, -1 +; RV64ZBB-NEXT: snez a0, a0 +; RV64ZBB-NEXT: addi a1, a1, -1 +; RV64ZBB-NEXT: snez a1, a1 +; RV64ZBB-NEXT: ret + %1 = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a) + %2 = icmp ne <2 x i64> %1, + ret <2 x i1> %2 +} + define signext i32 @sextb_i32(i32 signext %a) nounwind { ; RV64I-LABEL: sextb_i32: ; RV64I: # %bb.0: @@ -732,10 +1258,10 @@ define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: min_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a0, a1, .LBB18_2 +; RV64I-NEXT: blt a0, a1, .LBB36_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB18_2: +; RV64I-NEXT: .LBB36_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: min_i32: @@ -750,10 +1276,10 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: min_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a0, a1, .LBB19_2 +; RV64I-NEXT: blt a0, a1, .LBB37_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB19_2: +; RV64I-NEXT: .LBB37_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: min_i64: @@ -768,10 +1294,10 @@ define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: max_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a1, a0, .LBB20_2 +; RV64I-NEXT: blt a1, a0, .LBB38_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB20_2: +; RV64I-NEXT: .LBB38_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: max_i32: @@ -786,10 +1312,10 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: max_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: blt a1, a0, .LBB21_2 +; RV64I-NEXT: blt a1, a0, .LBB39_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB21_2: +; RV64I-NEXT: .LBB39_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: max_i64: @@ -804,10 +1330,10 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: minu_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a0, a1, .LBB22_2 +; RV64I-NEXT: bltu a0, a1, .LBB40_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB22_2: +; RV64I-NEXT: .LBB40_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: minu_i32: @@ -822,10 +1348,10 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: minu_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a0, a1, .LBB23_2 +; RV64I-NEXT: bltu a0, a1, .LBB41_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB23_2: +; RV64I-NEXT: .LBB41_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: minu_i64: @@ -840,10 +1366,10 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: maxu_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a1, a0, .LBB24_2 +; RV64I-NEXT: bltu a1, a0, .LBB42_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB24_2: +; RV64I-NEXT: .LBB42_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: maxu_i32: @@ -858,10 +1384,10 @@ define i64 @maxu_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: maxu_i64: ; RV64I: # %bb.0: -; RV64I-NEXT: bltu a1, a0, .LBB25_2 +; RV64I-NEXT: bltu a1, a0, .LBB43_2 ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: mv a0, a1 -; RV64I-NEXT: .LBB25_2: +; RV64I-NEXT: .LBB43_2: ; RV64I-NEXT: ret ; ; RV64ZBB-LABEL: maxu_i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll 
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-sdnode.ll @@ -857,6 +857,92 @@ %a = call @llvm.ctpop.nxv16i32( %va) ret %a } + +; We always emit vcpop.v for the scalable vector +define @ctpop_nxv16i32_ult_two( %va) { +; CHECK-LABEL: ctpop_nxv16i32_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ult_two: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmsleu.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv16i32( %va) + %cmp = icmp ult %a, shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv16i32_ugt_one( %va) { +; CHECK-LABEL: ctpop_nxv16i32_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ugt_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv16i32( %va) + %cmp = icmp ugt %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv16i32_eq_one( %va) { +; CHECK-LABEL: ctpop_nxv16i32_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v16, v8, v16 +; CHECK-NEXT: vmseq.vi v24, v16, 0 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vmand.mm v0, v16, v24 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv16i32_eq_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv16i32( %va) + %cmp = icmp eq %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv16i32_ne_one( %va) { +; CHECK-LABEL: ctpop_nxv16i32_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v16, v8, v16 +; CHECK-NEXT: vmsne.vi v24, v16, 0 +; CHECK-NEXT: vmseq.vi v16, v8, 0 +; CHECK-NEXT: vmor.mm v0, v16, v24 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv16i32_ne_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv16i32( %va) + %cmp = icmp ne %a, shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) + ret %cmp +} + declare @llvm.ctpop.nxv16i32() define @ctpop_nxv1i64( %va) { @@ -1189,4 +1275,90 @@ %a = call @llvm.ctpop.nxv8i64( %va) ret %a } + +; We always emit vcpop.v for the scalable vector +define @ctpop_nxv8i64_ult_two( %va) { +; CHECK-LABEL: ctpop_nxv8i64_ult_two: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vmseq.vi v0, v8, 0 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ult_two: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: 
vmsleu.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv8i64( %va) + %cmp = icmp ult %a, shufflevector ( insertelement ( poison, i64 2, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv8i64_ugt_one( %va) { +; CHECK-LABEL: ctpop_nxv8i64_ugt_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v8, v8, v16 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ugt_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmsgtu.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv8i64( %va) + %cmp = icmp ugt %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv8i64_eq_one( %va) { +; CHECK-LABEL: ctpop_nxv8i64_eq_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v16, v8, v16 +; CHECK-NEXT: vmseq.vi v24, v16, 0 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vmand.mm v0, v16, v24 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv8i64_eq_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmseq.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv8i64( %va) + %cmp = icmp eq %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + ret %cmp +} + +define @ctpop_nxv8i64_ne_one( %va) { +; CHECK-LABEL: ctpop_nxv8i64_ne_one: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vi v16, v8, -1 +; CHECK-NEXT: vand.vv v16, v8, v16 +; CHECK-NEXT: vmsne.vi v24, v16, 0 +; CHECK-NEXT: vmseq.vi v16, v8, 0 +; CHECK-NEXT: vmor.mm v0, v16, v24 +; CHECK-NEXT: ret +; +; CHECK-ZVBB-LABEL: ctpop_nxv8i64_ne_one: +; CHECK-ZVBB: # %bb.0: +; CHECK-ZVBB-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-ZVBB-NEXT: vcpop.v v8, v8 +; CHECK-ZVBB-NEXT: vmsne.vi v0, v8, 1 +; CHECK-ZVBB-NEXT: ret + %a = call @llvm.ctpop.nxv8i64( %va) + %cmp = icmp ne %a, shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) + ret %cmp +} + declare @llvm.ctpop.nxv8i64() diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop.ll @@ -824,6 +824,222 @@ store <8 x i32> %c, ptr %x ret void } +define <8 x i1> @ctpop_v8i32_ult_two(ptr %x, ptr %y) { +; LMULMAX2-LABEL: ctpop_v8i32_ult_two: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: vadd.vi v10, v8, -1 +; LMULMAX2-NEXT: vand.vv v8, v8, v10 +; LMULMAX2-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ctpop_v8i32_ult_two: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: vadd.vi v10, v8, -1 +; LMULMAX1-NEXT: vand.vv v8, v8, v10 +; LMULMAX1-NEXT: vmseq.vi v0, v8, 0 +; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; LMULMAX1-NEXT: vmv.v.i v8, 0 +; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vadd.vi v10, v9, -1 +; LMULMAX1-NEXT: vand.vv v9, v9, v10 +; LMULMAX1-NEXT: 
vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v9, 0
+; LMULMAX1-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ult_two:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ult <8 x i32> %c, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_ugt_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_ugt_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_ugt_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v8, 0
+; LMULMAX1-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v9, 0
+; LMULMAX1-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ugt_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ugt <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_eq_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_eq_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v12, v10, 0
+; LMULMAX2-NEXT: vmsne.vi v10, v8, 0
+; LMULMAX2-NEXT: vmand.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_eq_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v10, 0
+; LMULMAX1-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v12, 0
+; LMULMAX1-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v11, v13, 4
+; LMULMAX1-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-NEXT: vmand.mm v0, v11, v8
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_eq_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmseq.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp eq <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
+define <8 x i1> @ctpop_v8i32_ne_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v8i32_ne_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vle32.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v12, v10, 0
+; LMULMAX2-NEXT: vmseq.vi v10, v8, 0
+; LMULMAX2-NEXT: vmor.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-LABEL: ctpop_v8i32_ne_one:
+; LMULMAX1: # %bb.0:
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vle32.v v8, (a0)
+; LMULMAX1-NEXT: addi a0, a0, 16
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v10, 0
+; LMULMAX1-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmv.v.i v12, 0
+; LMULMAX1-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v11, v13, 4
+; LMULMAX1-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslideup.vi v8, v9, 4
+; LMULMAX1-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-NEXT: vmor.mm v0, v11, v8
+; LMULMAX1-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v8i32_ne_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsne.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <8 x i32>, ptr %x
+ %b = load <8 x i32>, ptr %y
+ %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
+ %cmp = icmp ne <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i1> %cmp
+}
 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
 define void @ctpop_v4i64(ptr %x, ptr %y) {
@@ -1022,4 +1238,362 @@
 store <4 x i64> %c, ptr %x
 ret void
 }
+define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v10, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v11
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ult_two:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ult_two:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsleu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ult <4 x i64> %c, <i64 2, i64 2, i64 2, i64 2>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v8, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v9, v10
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v11
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v10, v8, v10
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v11
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ugt_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v8, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v8, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v9, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v9, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ugt_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsgtu.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ugt <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmseq.vi v12, v10, 0
+; LMULMAX2-NEXT: vmsne.vi v10, v8, 0
+; LMULMAX2-NEXT: vmand.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v12, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v13, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v14, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v12, v14, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v12, v12, 0
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v14, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v15, v9, v14
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v8, v14
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v8, v9, 0
+; LMULMAX1-RV32-NEXT: vmand.mm v0, v12, v8
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_eq_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v12, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v11, v13, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-RV64-NEXT: vmand.mm v0, v11, v8
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_eq_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmseq.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp eq <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
+define <4 x i1> @ctpop_v4i64_ne_one(ptr %x, ptr %y) {
+; LMULMAX2-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX2: # %bb.0:
+; LMULMAX2-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; LMULMAX2-NEXT: vle64.v v8, (a0)
+; LMULMAX2-NEXT: vadd.vi v10, v8, -1
+; LMULMAX2-NEXT: vand.vv v10, v8, v10
+; LMULMAX2-NEXT: vmsne.vi v12, v10, 0
+; LMULMAX2-NEXT: vmseq.vi v10, v8, 0
+; LMULMAX2-NEXT: vmor.mm v0, v10, v12
+; LMULMAX2-NEXT: ret
+;
+; LMULMAX1-RV32-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX1-RV32: # %bb.0:
+; LMULMAX1-RV32-NEXT: addi a1, a0, 16
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vle64.v v8, (a1)
+; LMULMAX1-RV32-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v11, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v12, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmseq.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v13, 0
+; LMULMAX1-RV32-NEXT: vmerge.vim v14, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v12, v14, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v12, v12, 0
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vmv.v.i v14, -1
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v15, v9, v14
+; LMULMAX1-RV32-NEXT: vand.vv v9, v9, v15
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v9, v10
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v9, v11, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV32-NEXT: vadd.vv v11, v8, v14
+; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v11
+; LMULMAX1-RV32-NEXT: vmsne.vv v0, v8, v10
+; LMULMAX1-RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV32-NEXT: vmerge.vim v8, v13, 1, v0
+; LMULMAX1-RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV32-NEXT: vslideup.vi v9, v8, 2
+; LMULMAX1-RV32-NEXT: vmsne.vi v8, v9, 0
+; LMULMAX1-RV32-NEXT: vmor.mm v0, v12, v8
+; LMULMAX1-RV32-NEXT: ret
+;
+; LMULMAX1-RV64-LABEL: ctpop_v4i64_ne_one:
+; LMULMAX1-RV64: # %bb.0:
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vle64.v v8, (a0)
+; LMULMAX1-RV64-NEXT: addi a0, a0, 16
+; LMULMAX1-RV64-NEXT: vle64.v v9, (a0)
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v10, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v11, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vmseq.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmv.v.i v12, 0
+; LMULMAX1-RV64-NEXT: vmerge.vim v13, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v11, v13, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v11, v11, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v13, v8, -1
+; LMULMAX1-RV64-NEXT: vand.vv v8, v8, v13
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v8, 0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v8, v10, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-RV64-NEXT: vadd.vi v10, v9, -1
+; LMULMAX1-RV64-NEXT: vand.vv v9, v9, v10
+; LMULMAX1-RV64-NEXT: vmsne.vi v0, v9, 0
+; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
+; LMULMAX1-RV64-NEXT: vmerge.vim v9, v12, 1, v0
+; LMULMAX1-RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; LMULMAX1-RV64-NEXT: vslideup.vi v8, v9, 2
+; LMULMAX1-RV64-NEXT: vmsne.vi v8, v8, 0
+; LMULMAX1-RV64-NEXT: vmor.mm v0, v11, v8
+; LMULMAX1-RV64-NEXT: ret
+;
+; ZVBB-LABEL: ctpop_v4i64_ne_one:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZVBB-NEXT: vle64.v v8, (a0)
+; ZVBB-NEXT: vcpop.v v8, v8
+; ZVBB-NEXT: vmsne.vi v0, v8, 1
+; ZVBB-NEXT: ret
+ %a = load <4 x i64>, ptr %x
+ %b = load <4 x i64>, ptr %y
+ %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
+ %cmp = icmp ne <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i1> %cmp
+}
 declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
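
Not part of the patch: a minimal standalone C++ sketch (assuming a C++20 compiler for std::popcount; the sample values are illustrative) of the bit identities the non-Zbb/Zvbb check lines above rely on. Clearing the lowest set bit with x & (x - 1) leaves zero exactly when x has at most one bit set, which is why the base ISA output uses an add/and/compare sequence where cpop/vcpop.v is kept only when the hook reports it as fast.

// ctpop_identities.cpp -- sanity-check the ctpop-compare rewrites
// exercised by the tests above.
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t samples[] = {0u, 1u, 2u, 3u, 6u,
                              0x80000000u, 0x80000001u, 0xffffffffu};
  for (uint32_t x : samples) {
    int pc = std::popcount(x);
    // (ctpop x) u< 2  <=>  (x & (x-1)) == 0
    assert((pc < 2) == ((x & (x - 1)) == 0));
    // (ctpop x) u> 1  <=>  (x & (x-1)) != 0
    assert((pc > 1) == ((x & (x - 1)) != 0));
    // (ctpop x) == 1  <=>  x != 0 && (x & (x-1)) == 0
    assert((pc == 1) == (x != 0 && (x & (x - 1)) == 0));
    // (ctpop x) != 1  <=>  x == 0 || (x & (x-1)) != 0
    assert((pc != 1) == (x == 0 || (x & (x - 1)) != 0));
  }
  return 0;
}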