diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -15233,6 +15233,8 @@
 intrinsics, such as ``llvm.bitreverse.v4i32``, operate on a per-element basis
 and the element order is not affected.
 
+.. _int_bswap:
+
 '``llvm.bswap.*``' Intrinsics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -21999,6 +22001,53 @@
     %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a)
     %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
 
+.. _int_vp_bswap:
+
+'``llvm.vp.bswap.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.bswap.v16i32 (<16 x i32> <op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.bswap.nxv4i32 (<vscale x 4 x i32> <op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.bswap.v256i64 (<256 x i64> <op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated bswap of a vector of integers.
+
+
+Arguments:
+""""""""""
+
+The first operand and the result have the same vector of integer type. The
+second operand is the vector mask and has the same number of elements as the
+result vector type. The third operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.bswap``' intrinsic performs bswap (:ref:`bswap <int_bswap>`) of the first operand on each
+enabled lane. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %a, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a)
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> poison
+
+
 .. _int_mload_mstore:
 
 Masked Vector Load and Store Intrinsics
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4918,6 +4918,11 @@
   /// \returns The expansion result or SDValue() if it fails.
   SDValue expandBSWAP(SDNode *N, SelectionDAG &DAG) const;
 
+  /// Expand VP_BSWAP nodes. Expands VP_BSWAP nodes with i16/i32/i64 scalar
+  /// types. Returns SDValue() if expand fails.
+  /// \param N Node to expand
+  /// \returns The expansion result or SDValue() if it fails.
+  SDValue expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const;
+
   /// Expand BITREVERSE nodes. Expands scalar/vector BITREVERSE nodes.
   /// Returns SDValue() if expand fails.
   /// \param N Node to expand
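As a reader aid (not part of the patch), the two-shift scheme that ``expandVPBSWAP`` uses for i16 elements can be written directly in IR with the existing ``llvm.vp.shl``, ``llvm.vp.lshr``, and ``llvm.vp.or`` intrinsics; the function name below is purely illustrative:

.. code-block:: llvm

    declare <4 x i16> @llvm.vp.shl.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
    declare <4 x i16> @llvm.vp.lshr.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
    declare <4 x i16> @llvm.vp.or.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)

    define <4 x i16> @vp_bswap_i16_sketch(<4 x i16> %va, <4 x i1> %m, i32 %evl) {
      ; Move the low byte of each enabled lane up ...
      %lo = call <4 x i16> @llvm.vp.shl.v4i16(<4 x i16> %va, <4 x i16> <i16 8, i16 8, i16 8, i16 8>, <4 x i1> %m, i32 %evl)
      ; ... the high byte down ...
      %hi = call <4 x i16> @llvm.vp.lshr.v4i16(<4 x i16> %va, <4 x i16> <i16 8, i16 8, i16 8, i16 8>, <4 x i1> %m, i32 %evl)
      ; ... and recombine; lanes that are masked off or beyond %evl stay poison,
      ; matching the llvm.vp.bswap semantics documented above.
      %r  = call <4 x i16> @llvm.vp.or.v4i16(<4 x i16> %lo, <4 x i16> %hi, <4 x i1> %m, i32 %evl)
      ret <4 x i16> %r
    }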
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1558,6 +1558,10 @@
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_bswap : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
 
   // Floating-point arithmetic
   def int_vp_fadd : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -215,6 +215,10 @@
 BEGIN_REGISTER_VP(vp_umax, 2, 3, VP_UMAX, -1)
 VP_PROPERTY_BINARYOP
 END_REGISTER_VP(vp_umax, VP_UMAX)
+
+// llvm.vp.bswap(x,mask,vlen)
+BEGIN_REGISTER_VP(vp_bswap, 1, 2, VP_BSWAP, -1)
+END_REGISTER_VP(vp_bswap, VP_BSWAP)
 
 ///// } Integer Arithmetic
 
 ///// Floating-Point Arithmetic {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -730,6 +730,9 @@
   case ISD::BSWAP:
     Results.push_back(ExpandBSWAP(Node));
     return;
+  case ISD::VP_BSWAP:
+    Results.push_back(TLI.expandVPBSWAP(Node, DAG));
+    return;
   case ISD::VSELECT:
     Results.push_back(ExpandVSELECT(Node));
     return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1013,6 +1013,7 @@
   case ISD::ABS:
   case ISD::BITREVERSE:
   case ISD::BSWAP:
+  case ISD::VP_BSWAP:
   case ISD::CTLZ:
   case ISD::CTTZ:
   case ISD::CTLZ_ZERO_UNDEF:
@@ -4084,6 +4085,7 @@
   case ISD::ABS:
   case ISD::BITREVERSE:
   case ISD::BSWAP:
+  case ISD::VP_BSWAP:
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTPOP:
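The ``expandVPBSWAP`` hook added below implements the usual shift-and-mask byte swap, except that every node is a VP node carrying the same mask and EVL. For orientation, the i32 case corresponds roughly to the following hand-written IR (a sketch only; the function name is hypothetical and this code is not generated by the patch):

.. code-block:: llvm

    declare <4 x i32> @llvm.vp.shl.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
    declare <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
    declare <4 x i32> @llvm.vp.and.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
    declare <4 x i32> @llvm.vp.or.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

    define <4 x i32> @vp_bswap_i32_sketch(<4 x i32> %x, <4 x i1> %m, i32 %evl) {
      ; byte 0 -> byte 3 and byte 1 -> byte 2
      %b3 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %x, <4 x i32> <i32 24, i32 24, i32 24, i32 24>, <4 x i1> %m, i32 %evl)
      %m1 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %x, <4 x i32> <i32 65280, i32 65280, i32 65280, i32 65280>, <4 x i1> %m, i32 %evl)
      %b2 = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %m1, <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <4 x i1> %m, i32 %evl)
      ; byte 2 -> byte 1 and byte 3 -> byte 0
      %s1 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %x, <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <4 x i1> %m, i32 %evl)
      %b1 = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %s1, <4 x i32> <i32 65280, i32 65280, i32 65280, i32 65280>, <4 x i1> %m, i32 %evl)
      %b0 = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %x, <4 x i32> <i32 24, i32 24, i32 24, i32 24>, <4 x i1> %m, i32 %evl)
      ; recombine with a tree of vp.or; disabled lanes remain poison
      %hi = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %b3, <4 x i32> %b2, <4 x i1> %m, i32 %evl)
      %lo = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %b1, <4 x i32> %b0, <4 x i1> %m, i32 %evl)
      %r  = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %hi, <4 x i32> %lo, <4 x i1> %m, i32 %evl)
      ret <4 x i32> %r
    }

The i64 case extends the same pattern to eight bytes; the RISC-V cost table below reflects the growing expansion with estimated costs of 3, 12, and 31 for i16, i32, and i64 elements.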
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8388,6 +8388,82 @@
   }
 }
 
+SDValue TargetLowering::expandVPBSWAP(SDNode *N, SelectionDAG &DAG) const {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Op = N->getOperand(0);
+  SDValue Mask = N->getOperand(1);
+  SDValue EVL = N->getOperand(2);
+
+  if (!VT.isSimple())
+    return SDValue();
+
+  EVT SHVT = getShiftAmountTy(VT, DAG.getDataLayout());
+  SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+  switch (VT.getSimpleVT().getScalarType().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::i16:
+    Tmp1 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    return DAG.getNode(ISD::VP_OR, dl, VT, Tmp1, Tmp2, Mask, EVL);
+  case MVT::i32:
+    Tmp4 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+                       Mask, EVL);
+    Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Op, DAG.getConstant(0xFF00, dl, VT),
+                       Mask, EVL);
+    Tmp3 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp3, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+                       DAG.getConstant(0xFF00, dl, VT), Mask, EVL);
+    Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+                       Mask, EVL);
+    Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
+    return DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp2, Mask, EVL);
+  case MVT::i64:
+    Tmp8 = DAG.getNode(ISD::VP_SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+                       Mask, EVL);
+    Tmp7 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+                       DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
+    Tmp7 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp7, DAG.getConstant(40, dl, SHVT),
+                       Mask, EVL);
+    Tmp6 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+                       DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
+    Tmp6 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp6, DAG.getConstant(24, dl, SHVT),
+                       Mask, EVL);
+    Tmp5 = DAG.getNode(ISD::VP_AND, dl, VT, Op,
+                       DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
+    Tmp5 = DAG.getNode(ISD::VP_SHL, dl, VT, Tmp5, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    Tmp4 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(8, dl, SHVT),
+                       Mask, EVL);
+    Tmp4 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp4,
+                       DAG.getConstant(255ULL << 24, dl, VT), Mask, EVL);
+    Tmp3 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(24, dl, SHVT),
+                       Mask, EVL);
+    Tmp3 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp3,
+                       DAG.getConstant(255ULL << 16, dl, VT), Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(40, dl, SHVT),
+                       Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_AND, dl, VT, Tmp2,
+                       DAG.getConstant(255ULL << 8, dl, VT), Mask, EVL);
+    Tmp1 = DAG.getNode(ISD::VP_LSHR, dl, VT, Op, DAG.getConstant(56, dl, SHVT),
+                       Mask, EVL);
+    Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp7, Mask, EVL);
+    Tmp6 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp6, Tmp5, Mask, EVL);
+    Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp3, Mask, EVL);
+    Tmp2 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp2, Tmp1, Mask, EVL);
+    Tmp8 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp6, Mask, EVL);
+    Tmp4 = DAG.getNode(ISD::VP_OR, dl, VT, Tmp4, Tmp2, Mask, EVL);
+    return DAG.getNode(ISD::VP_OR, dl, VT, Tmp8, Tmp4, Mask, EVL);
+  }
+}
+
 SDValue TargetLowering::expandBITREVERSE(SDNode *N, SelectionDAG &DAG) const {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -591,6 +591,7 @@
                        Expand);
 
     setOperationAction(ISD::BSWAP, VT, Expand);
+    setOperationAction(ISD::VP_BSWAP, VT, Expand);
 
     // Custom-lower extensions and truncations from/to mask types.
setOperationAction({ISD::ANY_EXTEND, ISD::SIGN_EXTEND, ISD::ZERO_EXTEND}, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -415,6 +415,32 @@ {Intrinsic::bswap, MVT::nxv2i64, 31}, {Intrinsic::bswap, MVT::nxv4i64, 31}, {Intrinsic::bswap, MVT::nxv8i64, 31}, + {Intrinsic::vp_bswap, MVT::v2i16, 3}, + {Intrinsic::vp_bswap, MVT::v4i16, 3}, + {Intrinsic::vp_bswap, MVT::v8i16, 3}, + {Intrinsic::vp_bswap, MVT::v16i16, 3}, + {Intrinsic::vp_bswap, MVT::nxv1i16, 3}, + {Intrinsic::vp_bswap, MVT::nxv2i16, 3}, + {Intrinsic::vp_bswap, MVT::nxv4i16, 3}, + {Intrinsic::vp_bswap, MVT::nxv8i16, 3}, + {Intrinsic::vp_bswap, MVT::nxv16i16, 3}, + {Intrinsic::vp_bswap, MVT::v2i32, 12}, + {Intrinsic::vp_bswap, MVT::v4i32, 12}, + {Intrinsic::vp_bswap, MVT::v8i32, 12}, + {Intrinsic::vp_bswap, MVT::v16i32, 12}, + {Intrinsic::vp_bswap, MVT::nxv1i32, 12}, + {Intrinsic::vp_bswap, MVT::nxv2i32, 12}, + {Intrinsic::vp_bswap, MVT::nxv4i32, 12}, + {Intrinsic::vp_bswap, MVT::nxv8i32, 12}, + {Intrinsic::vp_bswap, MVT::nxv16i32, 12}, + {Intrinsic::vp_bswap, MVT::v2i64, 31}, + {Intrinsic::vp_bswap, MVT::v4i64, 31}, + {Intrinsic::vp_bswap, MVT::v8i64, 31}, + {Intrinsic::vp_bswap, MVT::v16i64, 31}, + {Intrinsic::vp_bswap, MVT::nxv1i64, 31}, + {Intrinsic::vp_bswap, MVT::nxv2i64, 31}, + {Intrinsic::vp_bswap, MVT::nxv4i64, 31}, + {Intrinsic::vp_bswap, MVT::nxv8i64, 31}, {Intrinsic::bitreverse, MVT::v2i8, 17}, {Intrinsic::bitreverse, MVT::v4i8, 17}, {Intrinsic::bitreverse, MVT::v8i8, 17}, diff --git a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll --- a/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll +++ b/llvm/test/Analysis/CostModel/RISCV/int-bit-manip.ll @@ -242,6 +242,66 @@ ret void } +define void @vp_bswap() { +; CHECK-LABEL: 'vp_bswap' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call @llvm.vp.bswap.nxv1i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call @llvm.vp.bswap.nxv2i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call @llvm.vp.bswap.nxv4i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call @llvm.vp.bswap.nxv8i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call @llvm.vp.bswap.nxv16i16( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %10 = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %11 = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> 
undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %12 = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %13 = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %14 = call @llvm.vp.bswap.nxv1i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %15 = call @llvm.vp.bswap.nxv2i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %16 = call @llvm.vp.bswap.nxv4i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %17 = call @llvm.vp.bswap.nxv8i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %18 = call @llvm.vp.bswap.nxv16i32( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %19 = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %20 = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %21 = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %22 = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %23 = call @llvm.vp.bswap.nxv1i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %24 = call @llvm.vp.bswap.nxv2i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %25 = call @llvm.vp.bswap.nxv4i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %26 = call @llvm.vp.bswap.nxv8i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %27 = call @llvm.vp.bswap.nxv16i64( undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + call @llvm.vp.bswap.nvx1i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx2i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx4i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx8i16( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx16i16( undef, undef, i32 undef) + call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + call @llvm.vp.bswap.nvx1i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx2i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx4i32( undef, undef, i32 undef) + call 
@llvm.vp.bswap.nvx8i32( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx16i32( undef, undef, i32 undef) + call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + call @llvm.vp.bswap.nvx1i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx2i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx4i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx8i64( undef, undef, i32 undef) + call @llvm.vp.bswap.nvx16i64( undef, undef, i32 undef) + ret void +} declare i16 @llvm.bswap.i16(i16) declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) @@ -356,3 +416,30 @@ declare @llvm.ctpop.nvx8i64() declare @llvm.ctpop.nvx16i64() +declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32) +declare <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16>, <4 x i1>, i32) +declare <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16>, <8 x i1>, i32) +declare <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16>, <16 x i1>, i32) +declare @llvm.vp.bswap.nvx1i16(, , i32) +declare @llvm.vp.bswap.nvx2i16(, , i32) +declare @llvm.vp.bswap.nvx4i16(, , i32) +declare @llvm.vp.bswap.nvx8i16(, , i32) +declare @llvm.vp.bswap.nvx16i16(, , i32) +declare <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32>, <2 x i1>, i32) +declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32) +declare <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32>, <8 x i1>, i32) +declare <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32>, <16 x i1>, i32) +declare @llvm.vp.bswap.nvx1i32(, , i32) +declare @llvm.vp.bswap.nvx2i32(, , i32) +declare @llvm.vp.bswap.nvx4i32(, , i32) +declare @llvm.vp.bswap.nvx8i32(, , i32) +declare @llvm.vp.bswap.nvx16i32(, , i32) +declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32) +declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32) +declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32) +declare <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64>, <16 x i1>, i32) +declare @llvm.vp.bswap.nvx1i64(, , i32) +declare @llvm.vp.bswap.nvx2i64(, , i32) +declare @llvm.vp.bswap.nvx4i64(, , i32) +declare @llvm.vp.bswap.nvx8i64(, , i32) +declare @llvm.vp.bswap.nvx16i64(, , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll @@ -0,0 +1,1592 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.bswap.nxv1i16(, , i32) + +define @vp_bswap_nxv1i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv1i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv1i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv1i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv1i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; 
CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv1i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv2i16(, , i32) + +define @vp_bswap_nxv2i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv2i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv2i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv2i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv4i16(, , i32) + +define @vp_bswap_nxv4i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv4i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv4i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv4i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv8i16(, , i32) + +define @vp_bswap_nxv8i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv8i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv8i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsrl.vi v10, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv8i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv16i16(, , i32) + +define @vp_bswap_nxv16i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv16i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv16i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vsrl.vi v12, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: ret 
+ %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv16i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv32i16(, , i32) + +define @vp_bswap_nxv32i16( %va, %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v16, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.bswap.nxv32i16( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv32i16_unmasked( %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v16, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv32i16( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv1i32(, , i32) + +define @vp_bswap_nxv1i32( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0, v0.t +; RV32-NEXT: vsll.vi v10, v10, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv1i32( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv1i32_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv1i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv1i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv1i32( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv2i32(, , i32) + +define @vp_bswap_nxv2i32( %va, %m, i32 
zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0, v0.t +; RV32-NEXT: vsll.vi v10, v10, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv2i32( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv2i32_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv2i32( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv4i32(, , i32) + +define @vp_bswap_nxv4i32( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v10, v10, a0, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0, v0.t +; RV32-NEXT: vsll.vi v12, v12, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v10, v10, a0, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV64-NEXT: vor.vv v10, v10, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vsll.vi v12, v12, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv4i32( %va, %m, i32 
%evl) + ret %v +} + +define @vp_bswap_nxv4i32_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv4i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv4i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv4i32( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv8i32(, , i32) + +define @vp_bswap_nxv8i32( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v12, v12, a0, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v12, v12, a0, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV64-NEXT: vor.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv8i32( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv8i32_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv8i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv8i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsrl.vi v16, v8, 24 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m 
= shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv8i32( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv16i32(, , i32) + +define @vp_bswap_nxv16i32( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv16i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v16, v16, a0, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a0, v0.t +; RV32-NEXT: vsll.vi v24, v24, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv16i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v24, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v24, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv16i32( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv16i32_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv16i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vsrl.vi v16, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vand.vx v24, v8, a0 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv16i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vsrl.vi v16, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v16, v16, a0 +; RV64-NEXT: vsrl.vi v24, v8, 24 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: vand.vx v24, v8, a0 +; RV64-NEXT: vsll.vi v24, v24, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv16i32( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv1i64(, , i32) + +define @vp_bswap_nxv1i64( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vx v9, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v10, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v10, v10, a3, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v10, v8, a4, v0.t +; RV32-NEXT: vsll.vi v10, v10, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v12, v8, v11, v0.t +; RV32-NEXT: vsll.vi v12, v12, 8, v0.t +; RV32-NEXT: 
vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a3, v0.t +; RV32-NEXT: vand.vx v12, v12, a2, v0.t +; RV32-NEXT: vor.vv v10, v12, v10, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV32-NEXT: vand.vx v12, v12, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v11, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vand.vx v9, v8, a1, v0.t +; RV64-NEXT: vsll.vi v9, v9, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v10, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v11, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v11, v11, a4, v0.t +; RV64-NEXT: vor.vv v10, v10, v11, v0.t +; RV64-NEXT: vor.vv v9, v10, v9, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v11, v8, a4, v0.t +; RV64-NEXT: vand.vx v11, v11, a3, v0.t +; RV64-NEXT: vor.vv v10, v11, v10, v0.t +; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t +; RV64-NEXT: vand.vx v11, v11, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v11, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v9, v8, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv1i64( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv1i64_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsll.vx v9, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v10, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v10, v10, a3 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v10, v8, a4 +; RV32-NEXT: vsll.vi v10, v10, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v11, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v12, v8, v11 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: vsrl.vx v12, v8, a3 +; RV32-NEXT: vand.vx v12, v12, a2 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vand.vx v12, v12, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsll.vi v9, v9, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v10, 
v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v11, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v11, v11, a4 +; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vor.vv v9, v10, v9 +; RV64-NEXT: vsrl.vx v10, v8, a2 +; RV64-NEXT: vsrl.vx v11, v8, a4 +; RV64-NEXT: vand.vx v11, v11, a3 +; RV64-NEXT: vor.vv v10, v11, v10 +; RV64-NEXT: vsrl.vi v11, v8, 24 +; RV64-NEXT: vand.vx v11, v11, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v11 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv1i64( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv2i64(, , i32) + +define @vp_bswap_nxv2i64( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vx v10, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v12, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v12, v12, a3, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v12, v12, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v16, v8, v14, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v12, v16, v12, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV32-NEXT: vand.vx v16, v16, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v14, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vand.vx v10, v8, a1, v0.t +; RV64-NEXT: vsll.vi v10, v10, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vsll.vi v12, v12, 8, v0.t +; RV64-NEXT: vor.vv v10, v10, v12, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v12, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v14, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v14, v14, a4, v0.t +; RV64-NEXT: vor.vv v12, v12, v14, v0.t +; RV64-NEXT: vor.vv v10, v12, v10, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v14, v8, a4, v0.t +; RV64-NEXT: vand.vx v14, v14, a3, v0.t +; RV64-NEXT: vor.vv v12, v14, v12, v0.t +; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t +; RV64-NEXT: vand.vx v14, v14, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v14, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vor.vv v8, v10, v8, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv2i64( %va, %m, i32 %evl) + ret %v +} + +define 
@vp_bswap_nxv2i64_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsll.vx v10, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v12, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v12, v12, a3 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v12, v8, a4 +; RV32-NEXT: vsll.vi v12, v12, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v14, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v16, v8, v14 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: vsrl.vx v16, v8, a3 +; RV32-NEXT: vand.vx v16, v16, a2 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vand.vx v16, v16, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsll.vi v10, v10, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v12, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v14, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v14, v14, a4 +; RV64-NEXT: vor.vv v12, v12, v14 +; RV64-NEXT: vor.vv v10, v12, v10 +; RV64-NEXT: vsrl.vx v12, v8, a2 +; RV64-NEXT: vsrl.vx v14, v8, a4 +; RV64-NEXT: vand.vx v14, v14, a3 +; RV64-NEXT: vor.vv v12, v14, v12 +; RV64-NEXT: vsrl.vi v14, v8, 24 +; RV64-NEXT: vand.vx v14, v14, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv2i64( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv4i64(, , i32) + +define @vp_bswap_nxv4i64( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vx v12, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v16, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v16, v16, a3, v0.t +; RV32-NEXT: vor.vv v16, v12, v16, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v20, v12, 24, v0.t +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v12, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v24, v8, v12, v0.t +; RV32-NEXT: vsll.vi v24, v24, 8, v0.t +; 
RV32-NEXT: vor.vv v20, v20, v24, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vsrl.vx v20, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v24, v8, a3, v0.t +; RV32-NEXT: vand.vx v24, v24, a2, v0.t +; RV32-NEXT: vor.vv v20, v24, v20, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: vand.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v20, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vand.vx v12, v8, a1, v0.t +; RV64-NEXT: vsll.vi v12, v12, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 8, v0.t +; RV64-NEXT: vor.vv v12, v12, v16, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v16, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v20, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v20, v20, a4, v0.t +; RV64-NEXT: vor.vv v16, v16, v20, v0.t +; RV64-NEXT: vor.vv v12, v16, v12, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v20, v8, a4, v0.t +; RV64-NEXT: vand.vx v20, v20, a3, v0.t +; RV64-NEXT: vor.vv v16, v20, v16, v0.t +; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t +; RV64-NEXT: vand.vx v20, v20, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v20, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v12, v8, v0.t +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv4i64( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv4i64_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsll.vx v12, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v16, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v16, v16, a3 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v20, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v24, v8, v20 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: vsrl.vx v24, v8, a3 +; RV32-NEXT: vand.vx v24, v24, a2 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: vand.vx v24, v24, a4 +; RV32-NEXT: vsrl.vi v8, v8, 8 +; RV32-NEXT: vand.vv v8, v8, v20 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: li 
a2, 56 +; RV64-NEXT: vsll.vx v16, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v20, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v20, v20, a4 +; RV64-NEXT: vor.vv v16, v16, v20 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsrl.vx v16, v8, a2 +; RV64-NEXT: vsrl.vx v20, v8, a4 +; RV64-NEXT: vand.vx v20, v20, a3 +; RV64-NEXT: vor.vv v16, v20, v16 +; RV64-NEXT: vsrl.vi v20, v8, 24 +; RV64-NEXT: vand.vx v20, v20, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.bswap.nxv4i64( %va, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.bswap.nxv7i64(, , i32) + +define @vp_bswap_nxv7i64( %va, %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv7i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v 
v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv7i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v16, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v24, 8, v0.t +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v24, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: vor.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t +; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vor.vv v24, v16, v24, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v8, v24, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = call @llvm.vp.bswap.nxv7i64( %va, %m, i32 %evl) + ret %v +} + +define @vp_bswap_nxv7i64_unmasked( %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv7i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: 
addi a0, sp, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv7i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v24, v8, a0 +; RV64-NEXT: vsll.vi v24, v24, 8 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v24, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v0, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v0, v0, a4 +; RV64-NEXT: vor.vv v24, v24, v0 +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsrl.vx v24, v8, a2 +; RV64-NEXT: vsrl.vx v0, v8, a4 +; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vor.vv v24, v0, v24 +; RV64-NEXT: vsrl.vi v0, v8, 24 +; RV64-NEXT: vand.vx v0, v0, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: ret + %head = insertelement <vscale x 7 x i1> poison, i1 true, i32 0 + %m = shufflevector <vscale x 7 x i1> %head, <vscale x 7 x i1> poison, <vscale x 7 x i32> zeroinitializer + %v = call <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl) + ret <vscale x 7 x i64> %v +} + +declare <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32) + +define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2, v0.t +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 4 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v16, v8, v16, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded 
Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a3, v0.t +; RV32-NEXT: vand.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v16, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v24, 8, v0.t +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v24, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: vor.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t +; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vor.vv v24, v16, v24, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v8, v24, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x i64> %v +} + +define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr 
a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsll.vx v16, v8, a1 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: addi a2, a2, -256 +; RV32-NEXT: vand.vx v24, v8, a2 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: vsll.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v16, v24 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v16, v8, a4 +; RV32-NEXT: vsll.vi v0, v16, 24 +; RV32-NEXT: addi a5, sp, 8 +; RV32-NEXT: vsetvli a6, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v16, (a5), zero +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v8, v16 +; RV32-NEXT: vsll.vi v24, v24, 8 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v0, v8, a3 +; RV32-NEXT: vand.vx v0, v0, a2 +; RV32-NEXT: vsrl.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v0, v24 +; RV32-NEXT: vsrl.vi v0, v8, 8 +; RV32-NEXT: vand.vv v16, v0, v16 +; RV32-NEXT: vsrl.vi v8, v8, 24 +; RV32-NEXT: vand.vx v8, v8, a4 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v8, v24 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsll.vi v16, v16, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v24, v8, a0 +; RV64-NEXT: vsll.vi v24, v24, 8 +; RV64-NEXT: vor.vv v16, v16, v24 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v24, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v0, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v0, v0, a4 +; RV64-NEXT: vor.vv v24, v24, v0 +; RV64-NEXT: vor.vv v16, v24, v16 +; RV64-NEXT: vsrl.vx v24, v8, a2 +; RV64-NEXT: vsrl.vx v0, v8, a4 +; RV64-NEXT: vand.vx v0, v0, a3 +; RV64-NEXT: vor.vv v24, v0, v24 +; RV64-NEXT: vsrl.vi v0, v8, 24 +; RV64-NEXT: vand.vx v0, v0, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v0 +; RV64-NEXT: vor.vv v8, v8, v24 +; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: ret + %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0 + %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer + %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl) + ret <vscale x 8 x i64> %v +} + +; Test splitting. Use i16 version for easier check. 
+declare <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i1>, i32) + +define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv64i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: vmv1r.v v24, v0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a2 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v16, 8, v0.t +; CHECK-NEXT: vsll.vi v16, v16, 8, v0.t +; CHECK-NEXT: vor.vv v8, v16, v8, v0.t +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a0, a1, .LBB32_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB32_2: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v16, v0.t +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl) + ret <vscale x 64 x i16> %v +} + +define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_nxv64i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub a2, a0, a1 +; CHECK-NEXT: sltu a3, a0, a2 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a2, a3, a2 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v24, v16, 8 +; CHECK-NEXT: vsll.vi v16, v16, 8 +; CHECK-NEXT: vor.vv v16, v16, v24 +; CHECK-NEXT: bltu a0, a1, .LBB33_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: .LBB33_2: +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsrl.vi v24, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v24 +; CHECK-NEXT: ret + %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0 + %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer + %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl) + ret <vscale x 64 x i16> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -0,0 +1,1513 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+experimental-zvfh,+v,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, 
<2 x i1>, i32) + +define <2 x i16> @vp_bswap_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +define <2 x i16> @vp_bswap_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v2i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl) + ret <2 x i16> %v +} + +declare <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16>, <4 x i1>, i32) + +define <4 x i16> @vp_bswap_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +define <4 x i16> @vp_bswap_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v4i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl) + ret <4 x i16> %v +} + +declare <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16>, <8 x i1>, i32) + +define <8 x i16> @vp_bswap_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +define <8 x i16> @vp_bswap_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v8i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl) + ret <8 x i16> %v +} + +declare <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16>, <16 x i1>, i32) + +define <16 x i16> @vp_bswap_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl) + ret 
<16 x i16> %v +} + +define <16 x i16> @vp_bswap_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) { +; CHECK-LABEL: vp_bswap_v16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vsrl.vi v10, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl) + ret <16 x i16> %v +} + +declare <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32>, <2 x i1>, i32) + +define <2 x i32> @vp_bswap_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0, v0.t +; RV32-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0, v0.t +; RV32-NEXT: vsll.vi v10, v10, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: ret + %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v2i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v2i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl) + ret <2 x i32> %v +} + +declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32) + +define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0, v0.t 
+; RV32-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV32-NEXT: vor.vv v9, v9, v10, v0.t +; RV32-NEXT: vand.vx v10, v8, a0, v0.t +; RV32-NEXT: vsll.vi v10, v10, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0, v0.t +; RV64-NEXT: vsrl.vi v10, v8, 24, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: ret + %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v4i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV32-NEXT: vsrl.vi v9, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vi v10, v8, 24 +; RV32-NEXT: vor.vv v9, v9, v10 +; RV32-NEXT: vand.vx v10, v8, a0 +; RV32-NEXT: vsll.vi v10, v10, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v4i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; RV64-NEXT: vsrl.vi v9, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v9, v9, a0 +; RV64-NEXT: vsrl.vi v10, v8, 24 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl) + ret <4 x i32> %v +} + +declare <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32>, <8 x i1>, i32) + +define <8 x i32> @vp_bswap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v10, v10, a0, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV32-NEXT: vor.vv v10, v10, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a0, v0.t +; RV32-NEXT: vsll.vi v12, v12, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v10, v10, a0, v0.t +; RV64-NEXT: vsrl.vi v12, v8, 24, v0.t +; RV64-NEXT: vor.vv v10, v10, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vsll.vi v12, v12, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: ret + %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +define <8 x i32> 
@vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v8i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vi v12, v12, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v8i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; RV64-NEXT: vsrl.vi v10, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v10, v10, a0 +; RV64-NEXT: vsrl.vi v12, v8, 24 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl) + ret <8 x i32> %v +} + +declare <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32>, <16 x i1>, i32) + +define <16 x i32> @vp_bswap_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v16i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v12, v12, a0, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV32-NEXT: vor.vv v12, v12, v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a0, v0.t +; RV32-NEXT: vsll.vi v16, v16, 8, v0.t +; RV32-NEXT: vsll.vi v8, v8, 24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v16i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v12, v12, a0, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV64-NEXT: vor.vv v12, v12, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 8, v0.t +; RV64-NEXT: vsll.vi v8, v8, 24, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: ret + %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v16i32_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: lui a0, 16 +; RV32-NEXT: addi a0, a0, -256 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v16i32_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vsrl.vi v12, v8, 8 +; RV64-NEXT: lui a0, 16 +; RV64-NEXT: addiw a0, a0, -256 +; RV64-NEXT: vand.vx v12, v12, a0 +; RV64-NEXT: vsrl.vi v16, v8, 24 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsll.vi v16, v16, 8 +; 
RV64-NEXT: vsll.vi v8, v8, 24 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl) + ret <16 x i32> %v +} + +declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32) + +define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v9, v0 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vx v10, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v11, v8, a2, v0.t +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v11, v11, a3, v0.t +; RV32-NEXT: vor.vv v10, v11, v10, v0.t +; RV32-NEXT: vsrl.vi v11, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v11, v11, a4, v0.t +; RV32-NEXT: vsrl.vi v12, v8, 8, v0.t +; RV32-NEXT: li a5, 5 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v13, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v13, v13, a5, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vmv1r.v v0, v9 +; RV32-NEXT: vand.vv v12, v12, v13, v0.t +; RV32-NEXT: vor.vv v11, v12, v11, v0.t +; RV32-NEXT: vor.vv v10, v11, v10, v0.t +; RV32-NEXT: vsll.vx v11, v8, a1, v0.t +; RV32-NEXT: vand.vx v12, v8, a3, v0.t +; RV32-NEXT: vsll.vx v12, v12, a2, v0.t +; RV32-NEXT: vor.vv v11, v11, v12, v0.t +; RV32-NEXT: vand.vx v12, v8, a4, v0.t +; RV32-NEXT: vsll.vi v12, v12, 24, v0.t +; RV32-NEXT: vand.vv v8, v8, v13, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v12, v8, v0.t +; RV32-NEXT: vor.vv v8, v11, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vand.vx v9, v8, a1, v0.t +; RV64-NEXT: vsll.vi v9, v9, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v10, v8, a0, v0.t +; RV64-NEXT: vsll.vi v10, v10, 8, v0.t +; RV64-NEXT: vor.vv v9, v9, v10, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v10, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v11, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v11, v11, a4, v0.t +; RV64-NEXT: vor.vv v10, v10, v11, v0.t +; RV64-NEXT: vor.vv v9, v10, v9, v0.t +; RV64-NEXT: vsrl.vx v10, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v11, v8, a4, v0.t +; RV64-NEXT: vand.vx v11, v11, a3, v0.t +; RV64-NEXT: vor.vv v10, v11, v10, v0.t +; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t +; RV64-NEXT: vand.vx v11, v11, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v11, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vor.vv v8, v9, v8, v0.t +; RV64-NEXT: ret + %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vsrl.vx v9, v8, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, 
a3, -256 +; RV32-NEXT: vand.vx v10, v10, a3 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vsrl.vi v10, v8, 8 +; RV32-NEXT: li a4, 5 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vmv.v.i v11, 0 +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: vmerge.vxm v11, v11, a4, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vand.vv v10, v10, v11 +; RV32-NEXT: vsrl.vi v12, v8, 24 +; RV32-NEXT: lui a0, 4080 +; RV32-NEXT: vand.vx v12, v12, a0 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vor.vv v9, v10, v9 +; RV32-NEXT: vsll.vx v10, v8, a1 +; RV32-NEXT: vand.vx v12, v8, a3 +; RV32-NEXT: vsll.vx v12, v12, a2 +; RV32-NEXT: vor.vv v10, v10, v12 +; RV32-NEXT: vand.vx v12, v8, a0 +; RV32-NEXT: vsll.vi v12, v12, 24 +; RV32-NEXT: vand.vv v8, v8, v11 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsll.vi v9, v9, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 8 +; RV64-NEXT: vor.vv v9, v9, v10 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v10, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v11, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v11, v11, a4 +; RV64-NEXT: vor.vv v10, v10, v11 +; RV64-NEXT: vor.vv v9, v10, v9 +; RV64-NEXT: vsrl.vx v10, v8, a2 +; RV64-NEXT: vsrl.vx v11, v8, a4 +; RV64-NEXT: vand.vx v11, v11, a3 +; RV64-NEXT: vor.vv v10, v11, v10 +; RV64-NEXT: vsrl.vi v11, v8, 24 +; RV64-NEXT: vand.vx v11, v11, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v11 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl) + ret <2 x i64> %v +} + +declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32) + +define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v4i64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vx v12, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v14, v8, a2, v0.t +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v14, v14, a3, v0.t +; RV32-NEXT: vor.vv v12, v14, v12, v0.t +; RV32-NEXT: vsrl.vi v14, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v14, v14, a4, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: li a5, 85 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v18, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v18, v18, a5, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vand.vv v16, v16, v18, v0.t +; RV32-NEXT: vor.vv v14, v16, v14, v0.t +; RV32-NEXT: vor.vv v12, v14, v12, v0.t +; RV32-NEXT: vsll.vx v14, v8, a1, v0.t +; RV32-NEXT: vand.vx v16, v8, a3, v0.t +; RV32-NEXT: vsll.vx v16, v16, a2, v0.t +; RV32-NEXT: vor.vv v14, v14, 
v16, v0.t +; RV32-NEXT: vand.vx v16, v8, a4, v0.t +; RV32-NEXT: vsll.vi v16, v16, 24, v0.t +; RV32-NEXT: vand.vv v8, v8, v18, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v14, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v4i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vand.vx v10, v8, a1, v0.t +; RV64-NEXT: vsll.vi v10, v10, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v12, v8, a0, v0.t +; RV64-NEXT: vsll.vi v12, v12, 8, v0.t +; RV64-NEXT: vor.vv v10, v10, v12, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v12, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v14, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v14, v14, a4, v0.t +; RV64-NEXT: vor.vv v12, v12, v14, v0.t +; RV64-NEXT: vor.vv v10, v12, v10, v0.t +; RV64-NEXT: vsrl.vx v12, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v14, v8, a4, v0.t +; RV64-NEXT: vand.vx v14, v14, a3, v0.t +; RV64-NEXT: vor.vv v12, v14, v12, v0.t +; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t +; RV64-NEXT: vand.vx v14, v14, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v14, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vor.vv v8, v10, v8, v0.t +; RV64-NEXT: ret + %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vsrl.vx v10, v8, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v12, v12, a3 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsrl.vi v12, v8, 8 +; RV32-NEXT: li a4, 85 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v14, 0 +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: vmerge.vxm v14, v14, a4, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vand.vv v12, v12, v14 +; RV32-NEXT: vsrl.vi v16, v8, 24 +; RV32-NEXT: lui a0, 4080 +; RV32-NEXT: vand.vx v16, v16, a0 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vor.vv v10, v12, v10 +; RV32-NEXT: vsll.vx v12, v8, a1 +; RV32-NEXT: vand.vx v16, v8, a3 +; RV32-NEXT: vsll.vx v16, v16, a2 +; RV32-NEXT: vor.vv v12, v12, v16 +; RV32-NEXT: vand.vx v16, v8, a0 +; RV32-NEXT: vsll.vi v16, v16, 24 +; RV32-NEXT: vand.vv v8, v8, v14 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsll.vi v10, v10, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsll.vi v12, v12, 8 +; RV64-NEXT: vor.vv v10, v10, v12 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v12, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v14, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v14, v14, a4 +; RV64-NEXT: vor.vv v12, v12, v14 +; RV64-NEXT: vor.vv v10, v12, v10 +; 
RV64-NEXT: vsrl.vx v12, v8, a2 +; RV64-NEXT: vsrl.vx v14, v8, a4 +; RV64-NEXT: vand.vx v14, v14, a3 +; RV64-NEXT: vor.vv v12, v14, v12 +; RV64-NEXT: vsrl.vi v14, v8, 24 +; RV64-NEXT: vand.vx v14, v14, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v14 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl) + ret <4 x i64> %v +} + +declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32) + +define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v8i64: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v20, v8, a2, v0.t +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v20, v20, a3, v0.t +; RV32-NEXT: vor.vv v16, v20, v16, v0.t +; RV32-NEXT: vsrl.vi v20, v8, 24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v24, v20, a4, v0.t +; RV32-NEXT: vsrl.vi v28, v8, 8, v0.t +; RV32-NEXT: lui a5, 5 +; RV32-NEXT: addi a5, a5, 1365 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.i v20, 0 +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmerge.vxm v20, v20, a5, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vand.vv v28, v28, v20, v0.t +; RV32-NEXT: vor.vv v24, v28, v24, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vsll.vx v24, v8, a1, v0.t +; RV32-NEXT: vand.vx v28, v8, a3, v0.t +; RV32-NEXT: vsll.vx v28, v28, a2, v0.t +; RV32-NEXT: vor.vv v24, v24, v28, v0.t +; RV32-NEXT: vand.vx v28, v8, a4, v0.t +; RV32-NEXT: vsll.vi v28, v28, 24, v0.t +; RV32-NEXT: vand.vv v8, v8, v20, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v28, v8, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v8i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vand.vx v12, v8, a1, v0.t +; RV64-NEXT: vsll.vi v12, v12, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 8, v0.t +; RV64-NEXT: vor.vv v12, v12, v16, v0.t +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v16, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v20, v8, a3, v0.t +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v20, v20, a4, v0.t +; RV64-NEXT: vor.vv v16, v16, v20, v0.t +; RV64-NEXT: vor.vv v12, v16, v12, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v20, v8, a4, v0.t +; RV64-NEXT: vand.vx v20, v20, a3, v0.t +; RV64-NEXT: vor.vv v16, v20, v16, v0.t +; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t +; RV64-NEXT: vand.vx v20, v20, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v20, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v12, v8, v0.t +; RV64-NEXT: ret + %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +define <8 x i64> 
@vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vsrl.vx v12, v8, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v16, v8, a2 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v16, v16, a3 +; RV32-NEXT: vor.vv v12, v16, v12 +; RV32-NEXT: vsrl.vi v20, v8, 8 +; RV32-NEXT: lui a4, 5 +; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vand.vv v20, v20, v16 +; RV32-NEXT: vsrl.vi v24, v8, 24 +; RV32-NEXT: lui a0, 4080 +; RV32-NEXT: vand.vx v24, v24, a0 +; RV32-NEXT: vor.vv v20, v20, v24 +; RV32-NEXT: vor.vv v12, v20, v12 +; RV32-NEXT: vsll.vx v20, v8, a1 +; RV32-NEXT: vand.vx v24, v8, a3 +; RV32-NEXT: vsll.vx v24, v24, a2 +; RV32-NEXT: vor.vv v20, v20, v24 +; RV32-NEXT: vand.vx v24, v8, a0 +; RV32-NEXT: vsll.vi v24, v24, 24 +; RV32-NEXT: vand.vv v8, v8, v16 +; RV32-NEXT: vsll.vi v8, v8, 8 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: vor.vv v8, v20, v8 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsll.vi v12, v12, 24 +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsll.vi v16, v16, 8 +; RV64-NEXT: vor.vv v12, v12, v16 +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v16, v8, a2 +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: vand.vx v20, v8, a3 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vsll.vx v20, v20, a4 +; RV64-NEXT: vor.vv v16, v16, v20 +; RV64-NEXT: vor.vv v12, v16, v12 +; RV64-NEXT: vsrl.vx v16, v8, a2 +; RV64-NEXT: vsrl.vx v20, v8, a4 +; RV64-NEXT: vand.vx v20, v20, a3 +; RV64-NEXT: vor.vv v16, v20, v16 +; RV64-NEXT: vsrl.vi v20, v8, 24 +; RV64-NEXT: vand.vx v20, v20, a1 +; RV64-NEXT: vsrl.vi v8, v8, 8 +; RV64-NEXT: vand.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v20 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl) + ret <8 x i64> %v +} + +declare <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64>, <15 x i1>, i32) + +define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v15i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: vmv1r.v v1, v0 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v24, v24, a3, v0.t +; RV32-NEXT: vor.vv v24, v24, v16, v0.t +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 
24, v0.t +; RV32-NEXT: lui a4, 4080 +; RV32-NEXT: vand.vx v24, v24, a4, v0.t +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: slli a5, a5, 3 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill +; RV32-NEXT: lui a5, 349525 +; RV32-NEXT: addi a5, a5, 1365 +; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t +; RV32-NEXT: addi a6, sp, 16 +; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: li a6, 32 +; RV32-NEXT: vmv.s.x v0, a5 +; RV32-NEXT: vsetvli zero, a6, e32, m8, ta, ma +; RV32-NEXT: lui a5, 1044480 +; RV32-NEXT: vmv.v.i v24, 0 +; RV32-NEXT: vmerge.vxm v16, v24, a5, v0 +; RV32-NEXT: csrr a5, vlenb +; RV32-NEXT: li a6, 24 +; RV32-NEXT: mul a5, a5, a6 +; RV32-NEXT: add a5, sp, a5 +; RV32-NEXT: addi a5, a5, 16 +; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a0, a0, a5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v16, v24, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v16, v24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: vsll.vx v16, v8, a1, v0.t +; RV32-NEXT: vand.vx v24, v8, a3, v0.t +; RV32-NEXT: vsll.vx v24, v24, a2, v0.t +; RV32-NEXT: vor.vv v16, v16, v24, v0.t +; RV32-NEXT: vand.vx v24, v8, a4, v0.t +; RV32-NEXT: vsll.vi v24, v24, 24, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 24 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vsll.vi v8, v8, 8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_bswap_v15i64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: lui a1, 4080 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vand.vx v16, v8, a1, 
v0.t +; RV64-NEXT: vsll.vi v16, v16, 24, v0.t +; RV64-NEXT: li a0, 255 +; RV64-NEXT: slli a0, a0, 24 +; RV64-NEXT: vand.vx v24, v8, a0, v0.t +; RV64-NEXT: vsll.vi v24, v24, 8, v0.t +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vsll.vx v24, v8, a2, v0.t +; RV64-NEXT: lui a3, 16 +; RV64-NEXT: addiw a3, a3, -256 +; RV64-NEXT: li a4, 40 +; RV64-NEXT: vand.vx v16, v8, a3, v0.t +; RV64-NEXT: vsll.vx v16, v16, a4, v0.t +; RV64-NEXT: vor.vv v16, v24, v16, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vl8re8.v v24, (a5) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v16, v16, v24, v0.t +; RV64-NEXT: addi a5, sp, 16 +; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill +; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t +; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t +; RV64-NEXT: vand.vx v16, v16, a3, v0.t +; RV64-NEXT: vor.vv v24, v16, v24, v0.t +; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t +; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t +; RV64-NEXT: vand.vx v8, v8, a0, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vor.vv v8, v8, v24, v0.t +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl) + ret <15 x i64> %v +} + +define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) { +; RV32-LABEL: vp_bswap_v15i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: li a1, 56 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a1 +; RV32-NEXT: li a2, 40 +; RV32-NEXT: vsrl.vx v24, v8, a2 +; RV32-NEXT: lui a3, 16 +; RV32-NEXT: addi a3, a3, -256 +; RV32-NEXT: vand.vx v24, v24, a3 +; RV32-NEXT: vor.vv v16, v24, v16 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vi v24, v8, 8 +; RV32-NEXT: lui a4, 349525 +; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: li a4, 32 +; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma +; RV32-NEXT: lui a4, 1044480 +; RV32-NEXT: vmv.v.i v16, 0 +; RV32-NEXT: vmerge.vxm v16, v16, a4, v0 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: lui a0, 4080 +; RV32-NEXT: vsrl.vi v0, v8, 24 +; RV32-NEXT: vand.vx v0, v0, a0 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vl8re8.v v0, (a4) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vand.vx v0, v8, a3 +; RV32-NEXT: vsll.vx v0, v0, a2 +; RV32-NEXT: vsll.vx v24, v8, a1 +; RV32-NEXT: vor.vv v24, v24, v0 +; RV32-NEXT: vand.vv v16, v8, v16 +; RV32-NEXT: vand.vx v8, v8, a0 +; RV32-NEXT: vsll.vi v8, v8, 24 +; RV32-NEXT: vsll.vi v16, v16, 8 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v24, v8 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: 
add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vp_bswap_v15i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 4080
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v16, v16, 24
+; RV64-NEXT: li a0, 255
+; RV64-NEXT: slli a0, a0, 24
+; RV64-NEXT: vand.vx v24, v8, a0
+; RV64-NEXT: vsll.vi v24, v24, 8
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsll.vx v24, v8, a2
+; RV64-NEXT: lui a3, 16
+; RV64-NEXT: addiw a3, a3, -256
+; RV64-NEXT: vand.vx v0, v8, a3
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: vsll.vx v0, v0, a4
+; RV64-NEXT: vor.vv v24, v24, v0
+; RV64-NEXT: vor.vv v16, v24, v16
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vx v0, v8, a4
+; RV64-NEXT: vand.vx v0, v0, a3
+; RV64-NEXT: vor.vv v24, v0, v24
+; RV64-NEXT: vsrl.vi v0, v8, 24
+; RV64-NEXT: vand.vx v0, v0, a1
+; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v0
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vor.vv v8, v16, v8
+; RV64-NEXT: ret
+  %head = insertelement <15 x i1> poison, i1 true, i32 0
+  %m = shufflevector <15 x i1> %head, <15 x i1> poison, <15 x i32> zeroinitializer
+  %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
+  ret <15 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vp_bswap_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: vmv1r.v v1, v0
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -256
+; RV32-NEXT: vand.vx v24, v24, a3, v0.t
+; RV32-NEXT: vor.vv v24, v24, v16, v0.t
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 16
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT: lui a4, 4080
+; RV32-NEXT: vand.vx v24, v24, a4, v0.t
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: slli a5, a5, 3
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v24, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: lui a5, 349525
+; RV32-NEXT: addi a5, a5, 1365
+; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t
+; RV32-NEXT: addi a6, sp, 16
+; RV32-NEXT: vs8r.v v24, (a6) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: li a6, 32
+; RV32-NEXT: vmv.s.x v0, a5
+; RV32-NEXT: vsetvli zero, a6, e32, m8, ta, ma
+; RV32-NEXT: lui a5, 1044480
+; RV32-NEXT: vmv.v.i v24, 0
+; RV32-NEXT: vmerge.vxm v16, v24, a5, v0
+; RV32-NEXT: csrr a5, vlenb
+; RV32-NEXT: li a6, 24
+; RV32-NEXT: mul a5, a5, a6
+; RV32-NEXT: add a5, sp, a5
+; RV32-NEXT: addi a5, a5, 16
+; RV32-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a5, 24
+; RV32-NEXT: mul a0, a0, a5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v16, v24, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v16, v24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: vsll.vx v16, v8, a1, v0.t
+; RV32-NEXT: vand.vx v24, v8, a3, v0.t
+; RV32-NEXT: vsll.vx v24, v24, a2, v0.t
+; RV32-NEXT: vor.vv v16, v16, v24, v0.t
+; RV32-NEXT: vand.vx v24, v8, a4, v0.t
+; RV32-NEXT: vsll.vi v24, v24, 24, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 24
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vand.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsll.vi v8, v8, 8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v24, v8, v0.t
+; RV32-NEXT: vor.vv v8, v16, v8, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v8, v16, v0.t
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vp_bswap_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 3
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: lui a1, 4080
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v16, v8, a1, v0.t
+; RV64-NEXT: vsll.vi v16, v16, 24, v0.t
+; RV64-NEXT: li a0, 255
+; RV64-NEXT: slli a0, a0, 24
+; RV64-NEXT: vand.vx v24, v8, a0, v0.t
+; RV64-NEXT: vsll.vi v24, v24, 8, v0.t
+; RV64-NEXT: vor.vv v16, v16, v24, v0.t
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsll.vx v24, v8, a2, v0.t
+; RV64-NEXT: lui a3, 16
+; RV64-NEXT: addiw a3, a3, -256
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: vand.vx v16, v8, a3, v0.t
+; RV64-NEXT: vsll.vx v16, v16, a4, v0.t
+; RV64-NEXT: vor.vv v16, v24, v16, v0.t
+; RV64-NEXT: addi a5, sp, 16
+; RV64-NEXT: vl8re8.v v24, (a5) # Unknown-size Folded Reload
+; RV64-NEXT: vor.vv v16, v16, v24, v0.t
+; RV64-NEXT: addi a5, sp, 16
+; RV64-NEXT: vs8r.v v16, (a5) # Unknown-size Folded Spill
+; RV64-NEXT: vsrl.vx v24, v8, a2, v0.t
+; RV64-NEXT: vsrl.vx v16, v8, a4, v0.t
+; RV64-NEXT: vand.vx v16, v16, a3, v0.t
+; RV64-NEXT: vor.vv v24, v16, v24, v0.t
+; RV64-NEXT: vsrl.vi v16, v8, 24, v0.t
+; RV64-NEXT: vand.vx v16, v16, a1, v0.t
+; RV64-NEXT: vsrl.vi v8, v8, 8, v0.t
+; RV64-NEXT: vand.vx v8, v8, a0, v0.t
+; RV64-NEXT: vor.vv v8, v8, v16, v0.t
+; RV64-NEXT: vor.vv v8, v8, v24, v0.t
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vor.vv v8, v16, v8, v0.t
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 3
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vp_bswap_v16i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: li a1, 56
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsrl.vx v16, v8, a1
+; RV32-NEXT: li a2, 40
+; RV32-NEXT: vsrl.vx v24, v8, a2
+; RV32-NEXT: lui a3, 16
+; RV32-NEXT: addi a3, a3, -256
+; RV32-NEXT: vand.vx v24, v24, a3
+; RV32-NEXT: vor.vv v16, v24, v16
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vsrl.vi v24, v8, 8
+; RV32-NEXT: lui a4, 349525
+; RV32-NEXT: addi a4, a4, 1365
+; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; RV32-NEXT: vmv.s.x v0, a4
+; RV32-NEXT: li a4, 32
+; RV32-NEXT: vsetvli zero, a4, e32, m8, ta, ma
+; RV32-NEXT: lui a4, 1044480
+; RV32-NEXT: vmv.v.i v16, 0
+; RV32-NEXT: vmerge.vxm v16, v16, a4, v0
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vand.vv v24, v24, v16
+; RV32-NEXT: lui a0, 4080
+; RV32-NEXT: vsrl.vi v0, v8, 24
+; RV32-NEXT: vand.vx v0, v0, a0
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vl8re8.v v0, (a4) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: addi a4, sp, 16
+; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV32-NEXT: vand.vx v0, v8, a3
+; RV32-NEXT: vsll.vx v0, v0, a2
+; RV32-NEXT: vsll.vx v24, v8, a1
+; RV32-NEXT: vor.vv v24, v24, v0
+; RV32-NEXT: vand.vv v16, v8, v16
+; RV32-NEXT: vand.vx v8, v8, a0
+; RV32-NEXT: vsll.vi v8, v8, 24
+; RV32-NEXT: vsll.vi v16, v16, 8
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: vor.vv v8, v24, v8
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vor.vv v8, v8, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vp_bswap_v16i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: lui a1, 4080
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vand.vx v16, v8, a1
+; RV64-NEXT: vsll.vi v16, v16, 24
+; RV64-NEXT: li a0, 255
+; RV64-NEXT: slli a0, a0, 24
+; RV64-NEXT: vand.vx v24, v8, a0
+; RV64-NEXT: vsll.vi v24, v24, 8
+; RV64-NEXT: vor.vv v16, v16, v24
+; RV64-NEXT: li a2, 56
+; RV64-NEXT: vsll.vx v24, v8, a2
+; RV64-NEXT: lui a3, 16
+; RV64-NEXT: addiw a3, a3, -256
+; RV64-NEXT: vand.vx v0, v8, a3
+; RV64-NEXT: li a4, 40
+; RV64-NEXT: vsll.vx v0, v0, a4
+; RV64-NEXT: vor.vv v24, v24, v0
+; RV64-NEXT: vor.vv v16, v24, v16
+; RV64-NEXT: vsrl.vx v24, v8, a2
+; RV64-NEXT: vsrl.vx v0, v8, a4
+; RV64-NEXT: vand.vx v0, v0, a3
+; RV64-NEXT: vor.vv v24, v0, v24
+; RV64-NEXT: vsrl.vi v0, v8, 24
+; RV64-NEXT: vand.vx v0, v0, a1
+; RV64-NEXT: vsrl.vi v8, v8, 8
+; RV64-NEXT: vand.vx v8, v8, a0
+; RV64-NEXT: vor.vv v8, v8, v0
+; RV64-NEXT: vor.vv v8, v8, v24
+; RV64-NEXT: vor.vv v8, v16, v8
+; RV64-NEXT: ret
+  %head = insertelement <16 x i1> poison, i1 true, i32 0
+  %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
+  ret <16 x i64> %v
+}
+
+declare <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16>, <128 x i1>, i32)
+
+define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bswap_v128i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: vslidedown.vi v24, v0, 8
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB26_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: .LBB26_2:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vor.vv v8, v8, v16, v0.t
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a1, a0, -64
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t
+; CHECK-NEXT: vor.vv v16, v8, v16, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
+  ret <128 x i16> %v
+}
+
+define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vp_bswap_v128i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB27_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: .LBB27_2:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v24, v8, 8
+; CHECK-NEXT: vsll.vi v8, v8, 8
+; CHECK-NEXT: vor.vv v8, v8, v24
+; CHECK-NEXT: addi a1, a0, -64
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsrl.vi v24, v16, 8
+; CHECK-NEXT: vsll.vi v16, v16, 8
+; CHECK-NEXT: vor.vv v16, v16, v24
+; CHECK-NEXT: ret
+  %head = insertelement <128 x i1> poison, i1 true, i32 0
+  %m = shufflevector <128 x i1> %head, <128 x i1> poison, <128 x i32> zeroinitializer
+  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
+  ret <128 x i16> %v
+}
diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp
--- a/llvm/unittests/IR/VPIntrinsicTest.cpp
+++ b/llvm/unittests/IR/VPIntrinsicTest.cpp
@@ -142,6 +142,9 @@
   Str << " declare <8 x i1> @llvm.vp.icmp.v8i16"
       << "(<8 x i16>, <8 x i16>, metadata, <8 x i1>, i32) ";
 
+  Str << " declare <8 x i16> @llvm.vp.bswap.v8i16"
+      << "(<8 x i16>, <8 x i1>, i32) ";
+
   return parseAssemblyString(Str.str(), Err, C);
 }
 };