diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -133,6 +133,39 @@ VECREDUCE_XOR, VECREDUCE_FADD, VECREDUCE_SEQ_FADD, + + // Vector binary and unary ops with VL as a third operand. + // FIXME: Can we replace these with ISD::VP_*? + ADD_VL, + AND_VL, + MUL_VL, + OR_VL, + SDIV_VL, + SHL_VL, + SREM_VL, + SRA_VL, + SRL_VL, + SUB_VL, + UDIV_VL, + UREM_VL, + XOR_VL, + FADD_VL, + FSUB_VL, + FMUL_VL, + FDIV_VL, + FNEG_VL, + + // Set mask vector to all zeros or ones. + VMCLR_VL, + VMSET_VL, + + // Memory opcodes start here. + VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE, + VSE_VL, + + // WARNING: Do not add anything in the end unless you want the node to + // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all + // opcodes will be thought as target memory ops! }; } // namespace RISCVISD @@ -336,6 +369,10 @@ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerToScalableOp(SDValue Op, SelectionDAG &DAG, + unsigned NewOpc) const; bool isEligibleForTailCallOptimization( CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF, @@ -346,6 +383,8 @@ void validateCCReservedRegs( const SmallVectorImpl> &Regs, MachineFunction &MF) const; + + bool useRVVForFixedLengthVectorVT(MVT VT) const; }; namespace RISCVVIntrinsicsTable { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -140,6 +140,32 @@ if (Subtarget.hasStdExtD()) for (MVT VT : F64VecVTs) addRegClassForRVV(VT); + + if (Subtarget.useRVVForFixedLengthVectors()) { + auto addRegClassForFixedVectors = [this](MVT VT) { + unsigned LMul = Subtarget.getLMULForFixedLengthVector(VT); + const TargetRegisterClass *RC; + if (LMul == 1) + RC = &RISCV::VRRegClass; + else if (LMul == 2) + RC = &RISCV::VRM2RegClass; + else if (LMul == 4) + RC = &RISCV::VRM4RegClass; + else if (LMul == 8) + RC = &RISCV::VRM8RegClass; + else + llvm_unreachable("Unexpected LMul!"); + + addRegisterClass(VT, RC); + }; + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) + if (useRVVForFixedLengthVectorVT(VT)) + addRegClassForFixedVectors(VT); + + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) + if (useRVVForFixedLengthVectorVT(VT)) + addRegClassForFixedVectors(VT); + } } // Compute derived properties from the register classes. @@ -484,6 +510,56 @@ if (Subtarget.hasStdExtD()) for (MVT VT : F64VecVTs) SetCommonVFPActions(VT); + + if (Subtarget.useRVVForFixedLengthVectors()) { + for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) { + if (!useRVVForFixedLengthVectorVT(VT)) + continue; + + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed. 
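+      // convertFromScalableVector() below materialises that cast as an
+      // EXTRACT_SUBVECTOR at index 0 of the scalable container, so the
+      // operation must stay legal for these fixed-length types.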
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::XOR, VT, Custom); + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::SREM, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + setOperationAction(ISD::UREM, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + } + + for (MVT VT : MVT::fp_fixedlen_vector_valuetypes()) { + if (!useRVVForFixedLengthVectorVT(VT)) + continue; + + // By default everything must be expanded. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + // We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed. + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + + setOperationAction(ISD::LOAD, VT, Custom); + setOperationAction(ISD::STORE, VT, Custom); + setOperationAction(ISD::FADD, VT, Custom); + setOperationAction(ISD::FSUB, VT, Custom); + setOperationAction(ISD::FMUL, VT, Custom); + setOperationAction(ISD::FDIV, VT, Custom); + setOperationAction(ISD::FNEG, VT, Custom); + } + } } // Function alignments. @@ -928,6 +1004,46 @@ case ISD::VECREDUCE_FADD: case ISD::VECREDUCE_SEQ_FADD: return lowerFPVECREDUCE(Op, DAG); + case ISD::LOAD: + return lowerFixedLengthVectorLoadToRVV(Op, DAG); + case ISD::STORE: + return lowerFixedLengthVectorStoreToRVV(Op, DAG); + case ISD::ADD: + return lowerToScalableOp(Op, DAG, RISCVISD::ADD_VL); + case ISD::SUB: + return lowerToScalableOp(Op, DAG, RISCVISD::SUB_VL); + case ISD::MUL: + return lowerToScalableOp(Op, DAG, RISCVISD::MUL_VL); + case ISD::AND: + return lowerToScalableOp(Op, DAG, RISCVISD::AND_VL); + case ISD::OR: + return lowerToScalableOp(Op, DAG, RISCVISD::OR_VL); + case ISD::XOR: + return lowerToScalableOp(Op, DAG, RISCVISD::XOR_VL); + case ISD::SDIV: + return lowerToScalableOp(Op, DAG, RISCVISD::SDIV_VL); + case ISD::SREM: + return lowerToScalableOp(Op, DAG, RISCVISD::SREM_VL); + case ISD::UDIV: + return lowerToScalableOp(Op, DAG, RISCVISD::UDIV_VL); + case ISD::UREM: + return lowerToScalableOp(Op, DAG, RISCVISD::UREM_VL); + case ISD::SHL: + return lowerToScalableOp(Op, DAG, RISCVISD::SHL_VL); + case ISD::SRA: + return lowerToScalableOp(Op, DAG, RISCVISD::SRA_VL); + case ISD::SRL: + return lowerToScalableOp(Op, DAG, RISCVISD::SRL_VL); + case ISD::FADD: + return lowerToScalableOp(Op, DAG, RISCVISD::FADD_VL); + case ISD::FSUB: + return lowerToScalableOp(Op, DAG, RISCVISD::FSUB_VL); + case ISD::FMUL: + return lowerToScalableOp(Op, DAG, RISCVISD::FMUL_VL); + case ISD::FDIV: + return lowerToScalableOp(Op, DAG, RISCVISD::FDIV_VL); + case ISD::FNEG: + return lowerToScalableOp(Op, DAG, RISCVISD::FNEG_VL); } } @@ -1742,6 +1858,137 @@ DAG.getConstant(0, DL, Subtarget.getXLenVT())); } +// Return the largest legal scalable vector type that matches VT's element type. 
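+// For example, with -riscv-v-vector-bits-min=128 a v4i32 needs LMUL=1 and is
+// operated on in an nxv2i32 container, while a v8i32 needs LMUL=2 and (when
+// the LMUL limit allows it) uses an nxv4i32 container.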
+static MVT getContainerForFixedLengthVector(SelectionDAG &DAG, MVT VT,
+                                            const RISCVSubtarget &Subtarget) {
+  assert(VT.isFixedLengthVector() &&
+         DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+         "Expected legal fixed length vector!");
+
+  unsigned LMul = Subtarget.getLMULForFixedLengthVector(VT);
+  assert(LMul <= 8 && isPowerOf2_32(LMul) && "Unexpected LMUL!");
+
+  switch (VT.getVectorElementType().SimpleTy) {
+  default:
+    llvm_unreachable("unexpected element type for RVV container");
+  case MVT::i8:
+    return MVT::getScalableVectorVT(MVT::i8, LMul * 8);
+  case MVT::i16:
+    return MVT::getScalableVectorVT(MVT::i16, LMul * 4);
+  case MVT::i32:
+    return MVT::getScalableVectorVT(MVT::i32, LMul * 2);
+  case MVT::i64:
+    return MVT::getScalableVectorVT(MVT::i64, LMul);
+  case MVT::f16:
+    return MVT::getScalableVectorVT(MVT::f16, LMul * 4);
+  case MVT::f32:
+    return MVT::getScalableVectorVT(MVT::f32, LMul * 2);
+  case MVT::f64:
+    return MVT::getScalableVectorVT(MVT::f64, LMul);
+  }
+}
+
+// Grow V to consume an entire RVV register.
+static SDValue convertToScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
+                                       const RISCVSubtarget &Subtarget) {
+  assert(VT.isScalableVector() &&
+         "Expected to convert into a scalable vector!");
+  assert(V.getValueType().isFixedLengthVector() &&
+         "Expected a fixed length vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(EVT VT, SDValue V, SelectionDAG &DAG,
+                                         const RISCVSubtarget &Subtarget) {
+  assert(VT.isFixedLengthVector() &&
+         "Expected to convert into a fixed length vector!");
+  assert(V.getValueType().isScalableVector() &&
+         "Expected a scalable vector operand!");
+  SDLoc DL(V);
+  SDValue Zero = DAG.getConstant(0, DL, Subtarget.getXLenVT());
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  auto *Load = cast<LoadSDNode>(Op);
+
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+  SDValue VL =
+      DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+
+  SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+  SDValue NewLoad = DAG.getMemIntrinsicNode(
+      RISCVISD::VLE_VL, DL, VTs, {Load->getChain(), Load->getBasePtr(), VL},
+      Load->getMemoryVT(), Load->getMemOperand());
+
+  SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+  return DAG.getMergeValues({Result, Load->getChain()}, DL);
+}
+
+SDValue
+RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  auto *Store = cast<StoreSDNode>(Op);
+
+  SDLoc DL(Op);
+  MVT VT = Store->getValue().getSimpleValueType();
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+  SDValue VL =
+      DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+
+  SDValue NewValue =
+      convertToScalableVector(ContainerVT, Store->getValue(), DAG, Subtarget);
+  return DAG.getMemIntrinsicNode(
+      RISCVISD::VSE_VL, DL, DAG.getVTList(MVT::Other),
+      {Store->getChain(), NewValue, Store->getBasePtr(), VL},
+      Store->getMemoryVT(), Store->getMemOperand());
+}
+
+SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG,
+                                               unsigned NewOpc) const {
+  MVT VT = Op.getSimpleValueType();
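+  // Wrap the fixed-length operands in scalable containers, emit the operation
+  // as a *_VL node with an all-ones mask and a VL equal to the fixed element
+  // count, then extract the fixed-length result back out.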
+  assert(useRVVForFixedLengthVectorVT(VT) &&
+         "Only expected to lower fixed length vector operation!");
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+
+  // Create list of operands by converting existing ones to scalable types.
+  SmallVector<SDValue, 4> Ops;
+  for (const SDValue &V : Op->op_values()) {
+    assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
+
+    // Pass through non-vector operands.
+    if (!V.getValueType().isVector()) {
+      Ops.push_back(V);
+      continue;
+    }
+
+    // "cast" fixed length vector to a scalable vector.
+    assert(useRVVForFixedLengthVectorVT(V.getSimpleValueType()) &&
+           "Only fixed length vectors are supported!");
+    Ops.push_back(convertToScalableVector(ContainerVT, V, DAG, Subtarget));
+  }
+
+  SDLoc DL(Op);
+  SDValue VL =
+      DAG.getConstant(VT.getVectorNumElements(), DL, Subtarget.getXLenVT());
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount());
+  SDValue Mask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+  Ops.push_back(Mask);
+  Ops.push_back(VL);
+
+  SDValue ScalableRes = DAG.getNode(NewOpc, DL, ContainerVT, Ops);
+  return convertFromScalableVector(VT, ScalableRes, DAG, Subtarget);
+}
+
 // Returns the opcode of the target-specific SDNode that implements the 32-bit
 // form of the given Opcode.
 static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -4310,6 +4557,28 @@
     NODE_NAME_CASE(VECREDUCE_XOR)
     NODE_NAME_CASE(VECREDUCE_FADD)
     NODE_NAME_CASE(VECREDUCE_SEQ_FADD)
+    NODE_NAME_CASE(ADD_VL)
+    NODE_NAME_CASE(AND_VL)
+    NODE_NAME_CASE(MUL_VL)
+    NODE_NAME_CASE(OR_VL)
+    NODE_NAME_CASE(SDIV_VL)
+    NODE_NAME_CASE(SHL_VL)
+    NODE_NAME_CASE(SREM_VL)
+    NODE_NAME_CASE(SRA_VL)
+    NODE_NAME_CASE(SRL_VL)
+    NODE_NAME_CASE(SUB_VL)
+    NODE_NAME_CASE(UDIV_VL)
+    NODE_NAME_CASE(UREM_VL)
+    NODE_NAME_CASE(XOR_VL)
+    NODE_NAME_CASE(FADD_VL)
+    NODE_NAME_CASE(FSUB_VL)
+    NODE_NAME_CASE(FMUL_VL)
+    NODE_NAME_CASE(FDIV_VL)
+    NODE_NAME_CASE(FNEG_VL)
+    NODE_NAME_CASE(VMCLR_VL)
+    NODE_NAME_CASE(VMSET_VL)
+    NODE_NAME_CASE(VLE_VL)
+    NODE_NAME_CASE(VSE_VL)
   }
   // clang-format on
   return nullptr;
@@ -4747,6 +5016,50 @@
   return false;
 }
+
+bool RISCVTargetLowering::useRVVForFixedLengthVectorVT(MVT VT) const {
+  if (!Subtarget.useRVVForFixedLengthVectors())
+    return false;
+
+  if (!VT.isFixedLengthVector())
+    return false;
+
+  // Don't use RVV for vectors we cannot scalarize if required.
+  switch (VT.getVectorElementType().SimpleTy) {
+  default:
+    return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::i64:
+    break;
+  case MVT::f16:
+    if (!Subtarget.hasStdExtZfh())
+      return false;
+    break;
+  case MVT::f32:
+    if (!Subtarget.hasStdExtF())
+      return false;
+    break;
+  case MVT::f64:
+    if (!Subtarget.hasStdExtD())
+      return false;
+    break;
+  }
+
+  unsigned LMul = Subtarget.getLMULForFixedLengthVector(VT);
+  // Don't use RVV for types that don't fit.
+  if (LMul > Subtarget.getMaxLMULForFixedLengthVectors())
+    return false;
+
+  // TODO: Perhaps an artificial restriction, but worth having whilst getting
+  // the base fixed length RVV support in place.
+ if (!VT.isPow2VectorType()) + return false; + + return true; +} + #define GET_REGISTER_MATCHER #include "RISCVGenAsmMatcher.inc" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -4435,3 +4435,4 @@ // Include the non-intrinsic ISel patterns include "RISCVInstrInfoVSDPatterns.td" +include "RISCVInstrInfoVVLPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -0,0 +1,190 @@ +//===- RISCVInstrInfoVVLPatterns.td - RVV VL patterns ------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This file contains the required infrastructure and VL patterns to +/// support code generation for the standard 'V' (Vector) extension, version +/// 0.10. This version is still experimental as the 'V' extension hasn't been +/// ratified yet. +/// +/// This file is included from and depends upon RISCVInstrInfoVPseudos.td +/// +/// Note: the patterns for RVV intrinsics are found in +/// RISCVInstrInfoVPseudos.td. +/// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Helpers to define the VL patterns. +//===----------------------------------------------------------------------===// + +def SDT_RISCVVLE_VL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisPtrTy<1>, + SDTCisVT<2, XLenVT>]>; +def SDT_RISCVVSE_VL : SDTypeProfile<0, 3, [SDTCisVec<0>, SDTCisPtrTy<1>, + SDTCisVT<2, XLenVT>]>; + +def SDT_RISCVIntBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVec<0>, SDTCisInt<0>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, XLenVT>]>; + +def SDT_RISCVFPUnOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, + SDTCisVec<0>, SDTCisFP<0>, + SDTCVecEltisVT<2, i1>, + SDTCisSameNumEltsAs<0, 2>, + SDTCisVT<3, XLenVT>]>; +def SDT_RISCVFPBinOp_VL : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisVec<0>, SDTCisFP<0>, + SDTCVecEltisVT<3, i1>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, XLenVT>]>; + +def riscv_vle_vl : SDNode<"RISCVISD::VLE_VL", SDT_RISCVVLE_VL, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def riscv_vse_vl : SDNode<"RISCVISD::VSE_VL", SDT_RISCVVSE_VL, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def riscv_add_vl : SDNode<"RISCVISD::ADD_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_sub_vl : SDNode<"RISCVISD::SUB_VL", SDT_RISCVIntBinOp_VL>; +def riscv_mul_vl : SDNode<"RISCVISD::MUL_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_and_vl : SDNode<"RISCVISD::AND_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_or_vl : SDNode<"RISCVISD::OR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_xor_vl : SDNode<"RISCVISD::XOR_VL", SDT_RISCVIntBinOp_VL, [SDNPCommutative]>; +def riscv_sdiv_vl : SDNode<"RISCVISD::SDIV_VL", SDT_RISCVIntBinOp_VL>; +def riscv_srem_vl : SDNode<"RISCVISD::SREM_VL", SDT_RISCVIntBinOp_VL>; +def riscv_udiv_vl : SDNode<"RISCVISD::UDIV_VL", SDT_RISCVIntBinOp_VL>; +def 
riscv_urem_vl : SDNode<"RISCVISD::UREM_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_shl_vl : SDNode<"RISCVISD::SHL_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_sra_vl : SDNode<"RISCVISD::SRA_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_srl_vl : SDNode<"RISCVISD::SRL_VL", SDT_RISCVIntBinOp_VL>;
+def riscv_fadd_vl : SDNode<"RISCVISD::FADD_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
+def riscv_fsub_vl : SDNode<"RISCVISD::FSUB_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fmul_vl : SDNode<"RISCVISD::FMUL_VL", SDT_RISCVFPBinOp_VL, [SDNPCommutative]>;
+def riscv_fdiv_vl : SDNode<"RISCVISD::FDIV_VL", SDT_RISCVFPBinOp_VL>;
+def riscv_fneg_vl : SDNode<"RISCVISD::FNEG_VL", SDT_RISCVFPUnOp_VL>;
+
+def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCisVec<0>,
+                                                SDTCVecEltisVT<0, i1>,
+                                                SDTCisVT<1, XLenVT>]>;
+def riscv_vmclr_vl : SDNode<"RISCVISD::VMCLR_VL", SDT_RISCVVMSETCLR_VL>;
+def riscv_vmset_vl : SDNode<"RISCVISD::VMSET_VL", SDT_RISCVVMSETCLR_VL>;
+
+def true_mask : PatLeaf<(riscv_vmset_vl (XLenVT srcvalue))>;
+
+class VPatBinaryVL_VV<SDNode vop,
+                      string instruction_name,
+                      ValueType result_type,
+                      ValueType op_type,
+                      ValueType mask_type,
+                      int sew,
+                      LMULInfo vlmul,
+                      VReg op_reg_class> :
+    Pat<(result_type (vop
+                      (op_type op_reg_class:$rs1),
+                      (op_type op_reg_class:$rs2),
+                      (mask_type true_mask),
+                      (XLenVT (VLOp GPR:$vl)))),
+        (!cast<Instruction>(instruction_name#"_VV_"# vlmul.MX)
+                      op_reg_class:$rs1,
+                      op_reg_class:$rs2,
+                      GPR:$vl, sew)>;
+
+multiclass VPatBinaryVL_VV_VX<SDNode vop, string instruction_name> {
+  foreach vti = AllIntegerVectors in {
+    def : VPatBinaryVL_VV<vop, instruction_name,
+                          vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+                          vti.LMul, vti.RegClass>;
+    // FIXME: Support splats.
+  }
+}
+
+multiclass VPatBinaryVL_VV_VX_VI<SDNode vop, string instruction_name> {
+  foreach vti = AllIntegerVectors in {
+    def : VPatBinaryVL_VV<vop, instruction_name,
+                          vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+                          vti.LMul, vti.RegClass>;
+    // FIXME: Support splats.
+  }
+}
+
+multiclass VPatBinaryFPVL_VV_VF<SDNode vop, string instruction_name> {
+  foreach vti = AllFloatVectors in {
+    def : VPatBinaryVL_VV<vop, instruction_name,
+                          vti.Vector, vti.Vector, vti.Mask, vti.SEW,
+                          vti.LMul, vti.RegClass>;
+    // FIXME: Support splats.
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Patterns.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+
+// 7.4. Vector Unit-Stride Instructions
+foreach vti = AllVectors in {
+  defvar load_instr = !cast<Instruction>("PseudoVLE"#vti.SEW#"_V_"#vti.LMul.MX);
+  defvar store_instr = !cast<Instruction>("PseudoVSE"#vti.SEW#"_V_"#vti.LMul.MX);
+  // Load
+  def : Pat<(vti.Vector (riscv_vle_vl RVVBaseAddr:$rs1, (XLenVT (VLOp GPR:$vl)))),
+            (load_instr RVVBaseAddr:$rs1, GPR:$vl, vti.SEW)>;
+  // Store
+  def : Pat<(riscv_vse_vl (vti.Vector vti.RegClass:$rs2), RVVBaseAddr:$rs1, (XLenVT (VLOp GPR:$vl))),
+            (store_instr vti.RegClass:$rs2, RVVBaseAddr:$rs1, GPR:$vl, vti.SEW)>;
+}
+
+// 12.1. Vector Single-Width Integer Add and Subtract
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_add_vl, "PseudoVADD">;
+defm "" : VPatBinaryVL_VV_VX<riscv_sub_vl, "PseudoVSUB">;
+
+// 12.5. Vector Bitwise Logical Instructions
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_and_vl, "PseudoVAND">;
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_or_vl, "PseudoVOR">;
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_xor_vl, "PseudoVXOR">;
+
+// 12.6. Vector Single-Width Bit Shift Instructions
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_shl_vl, "PseudoVSLL">;
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_sra_vl, "PseudoVSRA">;
+defm "" : VPatBinaryVL_VV_VX_VI<riscv_srl_vl, "PseudoVSRL">;
+
+// 12.10. Vector Single-Width Integer Multiply Instructions
+defm "" : VPatBinaryVL_VV_VX<riscv_mul_vl, "PseudoVMUL">;
+
+// 12.11. Vector Integer Divide Instructions
+defm "" : VPatBinaryVL_VV_VX<riscv_udiv_vl, "PseudoVDIVU">;
+defm "" : VPatBinaryVL_VV_VX<riscv_sdiv_vl, "PseudoVDIV">;
+defm "" : VPatBinaryVL_VV_VX<riscv_urem_vl, "PseudoVREMU">;
+defm "" : VPatBinaryVL_VV_VX<riscv_srem_vl, "PseudoVREM">;
+
+} // Predicates = [HasStdExtV]
+
+let Predicates = [HasStdExtV, HasStdExtF] in {
+
+// 14.2. Vector Single-Width Floating-Point Add/Subtract Instructions
+defm "" : VPatBinaryFPVL_VV_VF<riscv_fadd_vl, "PseudoVFADD">;
+defm "" : VPatBinaryFPVL_VV_VF<riscv_fsub_vl, "PseudoVFSUB">;
+
+// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
+defm "" : VPatBinaryFPVL_VV_VF<riscv_fmul_vl, "PseudoVFMUL">;
+defm "" : VPatBinaryFPVL_VV_VF<riscv_fdiv_vl, "PseudoVFDIV">;
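+
+// As an illustration: for the f32/LMUL=1 entry of AllFloatVectors, the lines
+// above match an nxv2f32 FADD_VL/FMUL_VL/etc. whose mask is the all-ones
+// true_mask and select the corresponding PseudoVF*_VV_M1 pseudo, passing the
+// GPR vl and the element width (32) as the sew operand.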
+// 14.10. Vector Floating-Point Sign-Injection Instructions
+// Handle fneg with VFSGNJN using the same input for both operands.
+foreach vti = AllFloatVectors in {
+  def : Pat<(riscv_fneg_vl (vti.Vector vti.RegClass:$rs), (vti.Mask true_mask),
+                           (XLenVT (VLOp GPR:$vl))),
+            (!cast<Instruction>("PseudoVFSGNJN_VV_"# vti.LMul.MX)
+                 vti.RegClass:$rs, vti.RegClass:$rs, GPR:$vl, vti.SEW)>;
+}
+
+} // Predicates = [HasStdExtV, HasStdExtF]
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -147,6 +147,14 @@
   InstructionSelector *getInstructionSelector() const override;
   const LegalizerInfo *getLegalizerInfo() const override;
   const RegisterBankInfo *getRegBankInfo() const override;
+
+  // Return the known range for the bit length of RVV data registers. A value
+  // of 0 means nothing is known about that particular limit beyond what's
+  // implied by the architecture.
+  unsigned getMinRVVVectorSizeInBits() const;
+  unsigned getLMULForFixedLengthVector(MVT VT) const;
+  unsigned getMaxLMULForFixedLengthVectors() const;
+  bool useRVVForFixedLengthVectors() const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -27,6 +27,18 @@
 #define GET_SUBTARGETINFO_CTOR
 #include "RISCVGenSubtargetInfo.inc"
 
+static cl::opt<unsigned> RVVVectorBitsMin(
+    "riscv-v-vector-bits-min",
+    cl::desc("Assume V extension vector registers are at least this big, "
+             "with zero meaning no minimum size is assumed."),
+    cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> RVVVectorLMULMax(
+    "riscv-v-fixed-length-vector-lmul-max",
+    cl::desc("The maximum LMUL value to use for fixed length vectors. "
+             "Fractional LMUL values are not supported."),
+    cl::init(8), cl::Hidden);
+
 void RISCVSubtarget::anchor() {}
 
 RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
@@ -81,3 +93,30 @@
 const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const {
   return RegBankInfo.get();
 }
+
+unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
+  assert(hasStdExtV() &&
+         "Tried to get vector length without V extension support!");
+  assert((RVVVectorBitsMin == 0 ||
+          (RVVVectorBitsMin >= 128 && isPowerOf2_32(RVVVectorBitsMin))) &&
+         "V extension requires vector length to be at least 128 and a power of "
+         "2!");
+  return PowerOf2Floor(RVVVectorBitsMin < 128 ? 0 : RVVVectorBitsMin);
+}
+
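+// Example of how the two options interact: with -riscv-v-vector-bits-min=128
+// and -riscv-v-fixed-length-vector-lmul-max=2, a 256-bit fixed vector such as
+// v8i32 needs LMUL=2 and is handled via RVV, while a 512-bit v16i32 would need
+// LMUL=4 and is left to the default legalization.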
+unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
+  assert(hasStdExtV() &&
+         "Tried to get maximum LMUL without V extension support!");
+  assert(RVVVectorLMULMax <= 8 && isPowerOf2_32(RVVVectorLMULMax) &&
+         "V extension requires a LMUL to be at most 8 and a power of 2!");
+  return PowerOf2Floor(std::max<unsigned>(RVVVectorLMULMax, 1));
+}
+
+bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
+  return hasStdExtV() && getMinRVVVectorSizeInBits() != 0;
+}
+
+unsigned RISCVSubtarget::getLMULForFixedLengthVector(MVT VT) const {
+  unsigned MinVLen = getMinRVVVectorSizeInBits();
+  return divideCeil(VT.getSizeInBits(), MinVLen);
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll
@@ -0,0 +1,926 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v,+experimental-zfh,+f,+d -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1,LMULMAX1-RV64
+
+define void @fadd_v8f16(<8 x half>* %x, <8 x half>* %y) {
+; CHECK-LABEL: fadd_v8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 8
+; CHECK-NEXT:    vsetvli a2, a2, e16,m1,ta,mu
+; CHECK-NEXT:    vle16.v v25, (a0)
+; CHECK-NEXT:    vle16.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse16.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <8 x half>, <8 x half>* %x
+  %b = load <8 x half>, <8 x half>* %y
+  %c = fadd <8 x half> %a, %b
+  store <8 x half> %c, <8 x half>* %x
+  ret void
+}
+
+define void @fadd_v4f32(<4 x float>* %x, <4 x float>* %y) {
+; CHECK-LABEL: fadd_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 4
+; CHECK-NEXT:    vsetvli a2, a2, e32,m1,ta,mu
+; CHECK-NEXT:    vle32.v v25, (a0)
+; CHECK-NEXT:    vle32.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse32.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <4 x float>, <4 x float>* %x
+  %b = load <4 x float>, <4 x float>* %y
+  %c = fadd <4 x float> %a, %b
+  store <4 x float> %c, <4 x float>* %x
+  ret void
+}
+
+define void @fadd_v2f64(<2 x double>* %x, <2 x double>* %y) {
+; CHECK-LABEL: fadd_v2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, zero, 2
+; CHECK-NEXT:    vsetvli a2, a2, e64,m1,ta,mu
+; CHECK-NEXT:    vle64.v v25, (a0)
+; CHECK-NEXT:    vle64.v v26, (a1)
+; CHECK-NEXT:    vfadd.vv v25, v25, v26
+; CHECK-NEXT:    vse64.v v25, (a0)
+; CHECK-NEXT:    ret
+  %a = load <2 x double>, <2 x double>* %x
+  %b = load <2 x double>, <2 x double>* %y
+  %c = fadd <2 x double> %a, %b
+  store <2 x double> %c, <2 x double>* %x
+  ret void
+}
+
+define void @fsub_v8f16(<8 x half>* %x, <8 x half>* %y) {
+; CHECK-LABEL: fsub_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fsub <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fsub_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fsub_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fsub <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fsub_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fsub_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfsub.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fsub <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fmul_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fmul_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fmul <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fmul_v4f32(<4 x float>* %x, <4 x float>* %y) { +; CHECK-LABEL: fmul_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fmul <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fmul_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fmul_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfmul.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fmul <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fdiv_v8f16(<8 x half>* %x, <8 x half>* %y) { +; CHECK-LABEL: fdiv_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = load <8 x half>, <8 x half>* %y + %c = fdiv <8 x half> %a, %b + store <8 x half> %c, <8 x half>* %x + ret void +} + +define void @fdiv_v4f32(<4 x float>* %x, <4 x float>* %y) 
{ +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = load <4 x float>, <4 x float>* %y + %c = fdiv <4 x float> %a, %b + store <4 x float> %c, <4 x float>* %x + ret void +} + +define void @fdiv_v2f64(<2 x double>* %x, <2 x double>* %y) { +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vfdiv.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = load <2 x double>, <2 x double>* %y + %c = fdiv <2 x double> %a, %b + store <2 x double> %c, <2 x double>* %x + ret void +} + +define void @fneg_v8f16(<8 x half>* %x) { +; CHECK-LABEL: fneg_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 8 +; CHECK-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vfsgnjn.vv v25, v25, v25 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x half>, <8 x half>* %x + %b = fneg <8 x half> %a + store <8 x half> %b, <8 x half>* %x + ret void +} + +define void @fneg_v4f32(<4 x float>* %x) { +; CHECK-LABEL: fneg_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 4 +; CHECK-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vfsgnjn.vv v25, v25, v25 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x float>, <4 x float>* %x + %b = fneg <4 x float> %a + store <4 x float> %b, <4 x float>* %x + ret void +} + +define void @fneg_v2f64(<2 x double>* %x) { +; CHECK-LABEL: fneg_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 2 +; CHECK-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vfsgnjn.vv v25, v25, v25 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x double>, <2 x double>* %x + %b = fneg <2 x double> %a + store <2 x double> %b, <2 x double>* %x + ret void +} + +define void @fadd_v16f16(<16 x half>* %x, <16 x half>* %y) { +; LMULMAX2-LABEL: fadd_v16f16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vfadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fadd_v16f16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fadd_v16f16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; 
LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x half>, <16 x half>* %x + %b = load <16 x half>, <16 x half>* %y + %c = fadd <16 x half> %a, %b + store <16 x half> %c, <16 x half>* %x + ret void +} + +define void @fadd_v8f32(<8 x float>* %x, <8 x float>* %y) { +; LMULMAX2-LABEL: fadd_v8f32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vfadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fadd_v8f32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fadd_v8f32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = load <8 x float>, <8 x float>* %y + %c = fadd <8 x float> %a, %b + store <8 x float> %c, <8 x float>* %x + ret void +} + +define void @fadd_v4f64(<4 x double>* %x, <4 x double>* %y) { +; LMULMAX2-LABEL: fadd_v4f64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vfadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fadd_v4f64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fadd_v4f64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfadd.vv v26, 
v27, v26 +; LMULMAX1-RV64-NEXT: vfadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x double>, <4 x double>* %x + %b = load <4 x double>, <4 x double>* %y + %c = fadd <4 x double> %a, %b + store <4 x double> %c, <4 x double>* %x + ret void +} + +define void @fsub_v16f16(<16 x half>* %x, <16 x half>* %y) { +; LMULMAX2-LABEL: fsub_v16f16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vfsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fsub_v16f16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fsub_v16f16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x half>, <16 x half>* %x + %b = load <16 x half>, <16 x half>* %y + %c = fsub <16 x half> %a, %b + store <16 x half> %c, <16 x half>* %x + ret void +} + +define void @fsub_v8f32(<8 x float>* %x, <8 x float>* %y) { +; LMULMAX2-LABEL: fsub_v8f32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vfsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fsub_v8f32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fsub_v8f32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfsub.vv v25, v25, v28 +; 
LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = load <8 x float>, <8 x float>* %y + %c = fsub <8 x float> %a, %b + store <8 x float> %c, <8 x float>* %x + ret void +} + +define void @fsub_v4f64(<4 x double>* %x, <4 x double>* %y) { +; LMULMAX2-LABEL: fsub_v4f64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vfsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fsub_v4f64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fsub_v4f64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x double>, <4 x double>* %x + %b = load <4 x double>, <4 x double>* %y + %c = fsub <4 x double> %a, %b + store <4 x double> %c, <4 x double>* %x + ret void +} + +define void @fmul_v16f16(<16 x half>* %x, <16 x half>* %y) { +; LMULMAX2-LABEL: fmul_v16f16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vfmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fmul_v16f16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fmul_v16f16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: 
vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x half>, <16 x half>* %x + %b = load <16 x half>, <16 x half>* %y + %c = fmul <16 x half> %a, %b + store <16 x half> %c, <16 x half>* %x + ret void +} + +define void @fmul_v8f32(<8 x float>* %x, <8 x float>* %y) { +; LMULMAX2-LABEL: fmul_v8f32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vfmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fmul_v8f32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fmul_v8f32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = load <8 x float>, <8 x float>* %y + %c = fmul <8 x float> %a, %b + store <8 x float> %c, <8 x float>* %x + ret void +} + +define void @fmul_v4f64(<4 x double>* %x, <4 x double>* %y) { +; LMULMAX2-LABEL: fmul_v4f64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vfmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fmul_v4f64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fmul_v4f64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x double>, <4 x 
double>* %x + %b = load <4 x double>, <4 x double>* %y + %c = fmul <4 x double> %a, %b + store <4 x double> %c, <4 x double>* %x + ret void +} + +define void @fdiv_v16f16(<16 x half>* %x, <16 x half>* %y) { +; LMULMAX2-LABEL: fdiv_v16f16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vfdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fdiv_v16f16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fdiv_v16f16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x half>, <16 x half>* %x + %b = load <16 x half>, <16 x half>* %y + %c = fdiv <16 x half> %a, %b + store <16 x half> %c, <16 x half>* %x + ret void +} + +define void @fdiv_v8f32(<8 x float>* %x, <8 x float>* %y) { +; LMULMAX2-LABEL: fdiv_v8f32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vfdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fdiv_v8f32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fdiv_v8f32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = load <8 x float>, <8 x float>* %y + %c = fdiv <8 
x float> %a, %b + store <8 x float> %c, <8 x float>* %x + ret void +} + +define void @fdiv_v4f64(<4 x double>* %x, <4 x double>* %y) { +; LMULMAX2-LABEL: fdiv_v4f64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vfdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: fdiv_v4f64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vfdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: fdiv_v4f64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vfdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vfdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x double>, <4 x double>* %x + %b = load <4 x double>, <4 x double>* %y + %c = fdiv <4 x double> %a, %b + store <4 x double> %c, <4 x double>* %x + ret void +} + +define void @fneg_v16f16(<16 x half>* %x) { +; LMULMAX2-LABEL: fneg_v16f16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 16 +; LMULMAX2-NEXT: vsetvli a1, a1, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: fneg_v16f16: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, zero, 8 +; LMULMAX1-NEXT: vsetvli a1, a1, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle16.v v25, (a1) +; LMULMAX1-NEXT: vle16.v v26, (a0) +; LMULMAX1-NEXT: vfsgnjn.vv v25, v25, v25 +; LMULMAX1-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX1-NEXT: vse16.v v26, (a0) +; LMULMAX1-NEXT: vse16.v v25, (a1) +; LMULMAX1-NEXT: ret + %a = load <16 x half>, <16 x half>* %x + %b = fneg <16 x half> %a + store <16 x half> %b, <16 x half>* %x + ret void +} + +define void @fneg_v8f32(<8 x float>* %x) { +; LMULMAX2-LABEL: fneg_v8f32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 8 +; LMULMAX2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: fneg_v8f32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, zero, 4 +; LMULMAX1-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v25, (a1) +; LMULMAX1-NEXT: vle32.v v26, (a0) +; LMULMAX1-NEXT: vfsgnjn.vv v25, v25, v25 +; LMULMAX1-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX1-NEXT: vse32.v v26, (a0) +; LMULMAX1-NEXT: vse32.v v25, (a1) +; LMULMAX1-NEXT: ret + %a = load <8 x float>, <8 x float>* %x + %b = fneg <8 x float> %a + store <8 x float> %b, <8 x float>* %x + ret void +} + 
+define void @fneg_v4f64(<4 x double>* %x) { +; LMULMAX2-LABEL: fneg_v4f64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 4 +; LMULMAX2-NEXT: vsetvli a1, a1, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: fneg_v4f64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, zero, 2 +; LMULMAX1-NEXT: vsetvli a1, a1, e64,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle64.v v25, (a1) +; LMULMAX1-NEXT: vle64.v v26, (a0) +; LMULMAX1-NEXT: vfsgnjn.vv v25, v25, v25 +; LMULMAX1-NEXT: vfsgnjn.vv v26, v26, v26 +; LMULMAX1-NEXT: vse64.v v26, (a0) +; LMULMAX1-NEXT: vse64.v v25, (a1) +; LMULMAX1-NEXT: ret + %a = load <4 x double>, <4 x double>* %x + %b = fneg <4 x double> %a + store <4 x double> %b, <4 x double>* %x + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -0,0 +1,3437 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1-RV64 + +define void @add_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: add_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = add <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @add_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: add_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = add <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @add_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: add_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = add <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @add_v2i64(<2 x i64>* %x, <2 x 
i64>* %y) { +; CHECK-LABEL: add_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vadd.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = add <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @sub_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: sub_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = sub <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @sub_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: sub_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = sub <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @sub_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: sub_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = sub <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @sub_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: sub_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = sub <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @mul_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: mul_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = mul <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @mul_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: mul_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = mul <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @mul_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: mul_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: 
vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = mul <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @mul_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: mul_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vmul.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = mul <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @and_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: and_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = and <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @and_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: and_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = and <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @and_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: and_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = and <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @and_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: and_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vand.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = and <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @or_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: or_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = or <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @or_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: or_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x 
i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = or <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @or_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: or_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = or <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @or_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: or_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vor.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = or <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @xor_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: xor_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = xor <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @xor_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: xor_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = xor <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @xor_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: xor_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = xor <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @xor_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: xor_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vxor.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = xor <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @lshr_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: lshr_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = lshr <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void 
@lshr_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: lshr_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = lshr <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @lshr_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: lshr_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = lshr <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @lshr_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: lshr_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vsrl.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = lshr <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @ashr_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: ashr_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsra.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = ashr <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @ashr_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: ashr_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsra.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = ashr <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @ashr_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: ashr_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vsra.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = ashr <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @ashr_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: ashr_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vsra.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = ashr <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @shl_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: shl_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; 
CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vsll.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = shl <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @shl_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: shl_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vsll.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = shl <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @shl_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: shl_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vsll.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = shl <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @shl_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: shl_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vsll.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = shl <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @sdiv_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: sdiv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vdiv.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = sdiv <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @sdiv_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: sdiv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vdiv.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = sdiv <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @sdiv_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: sdiv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vdiv.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = sdiv <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @sdiv_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: sdiv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vdiv.vv v25, v25, 
v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = sdiv <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @srem_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: srem_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vrem.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = srem <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @srem_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: srem_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vrem.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = srem <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @srem_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: srem_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vrem.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = srem <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @srem_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: srem_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vrem.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = srem <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @udiv_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: udiv_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vdivu.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = udiv <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @udiv_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: udiv_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vdivu.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = udiv <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @udiv_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: udiv_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vdivu.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* 
%y + %c = udiv <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @udiv_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: udiv_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vdivu.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = udiv <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @urem_v16i8(<16 x i8>* %x, <16 x i8>* %y) { +; CHECK-LABEL: urem_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 16 +; CHECK-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v25, (a0) +; CHECK-NEXT: vle8.v v26, (a1) +; CHECK-NEXT: vremu.vv v25, v25, v26 +; CHECK-NEXT: vse8.v v25, (a0) +; CHECK-NEXT: ret + %a = load <16 x i8>, <16 x i8>* %x + %b = load <16 x i8>, <16 x i8>* %y + %c = urem <16 x i8> %a, %b + store <16 x i8> %c, <16 x i8>* %x + ret void +} + +define void @urem_v8i16(<8 x i16>* %x, <8 x i16>* %y) { +; CHECK-LABEL: urem_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 8 +; CHECK-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vle16.v v26, (a1) +; CHECK-NEXT: vremu.vv v25, v25, v26 +; CHECK-NEXT: vse16.v v25, (a0) +; CHECK-NEXT: ret + %a = load <8 x i16>, <8 x i16>* %x + %b = load <8 x i16>, <8 x i16>* %y + %c = urem <8 x i16> %a, %b + store <8 x i16> %c, <8 x i16>* %x + ret void +} + +define void @urem_v4i32(<4 x i32>* %x, <4 x i32>* %y) { +; CHECK-LABEL: urem_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 4 +; CHECK-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: vle32.v v26, (a1) +; CHECK-NEXT: vremu.vv v25, v25, v26 +; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: ret + %a = load <4 x i32>, <4 x i32>* %x + %b = load <4 x i32>, <4 x i32>* %y + %c = urem <4 x i32> %a, %b + store <4 x i32> %c, <4 x i32>* %x + ret void +} + +define void @urem_v2i64(<2 x i64>* %x, <2 x i64>* %y) { +; CHECK-LABEL: urem_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a2, zero, 2 +; CHECK-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; CHECK-NEXT: vle64.v v25, (a0) +; CHECK-NEXT: vle64.v v26, (a1) +; CHECK-NEXT: vremu.vv v25, v25, v26 +; CHECK-NEXT: vse64.v v25, (a0) +; CHECK-NEXT: ret + %a = load <2 x i64>, <2 x i64>* %x + %b = load <2 x i64>, <2 x i64>* %y + %c = urem <2 x i64> %a, %b + store <2 x i64> %c, <2 x i64>* %x + ret void +} + +define void @add_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: add_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: add_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: add_v32i8: +; LMULMAX1-RV64: # %bb.0: +; 
LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = add <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @add_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: add_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: add_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: add_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = add <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @add_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: add_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: add_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: add_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, 
(a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = add <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @add_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: add_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: add_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: add_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = add <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @sub_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: sub_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sub_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sub_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; 
LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = sub <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @sub_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: sub_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sub_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sub_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = sub <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @sub_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: sub_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sub_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sub_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsub.vv v26, v27, v26 
+; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = sub <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @sub_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: sub_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sub_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsub.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sub_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsub.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = sub <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @mul_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: mul_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mul_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mul_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: 
ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = mul <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @mul_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: mul_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mul_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mul_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = mul <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @mul_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: mul_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mul_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mul_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = mul <8 x i32> %a, %b + store <8 x 
i32> %c, <8 x i32>* %x + ret void +} + +define void @mul_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: mul_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vmul.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: mul_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vmul.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: mul_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vmul.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vmul.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = mul <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @and_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: and_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vand.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: and_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: and_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vand.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = and <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @and_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: and_v16i16: +; LMULMAX2: # 
%bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vand.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: and_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: and_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vand.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = and <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @and_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: and_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vand.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: and_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: and_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vand.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = and <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @and_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: and_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) 
+; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vand.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: and_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vand.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: and_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vand.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = and <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @or_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: or_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: or_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: or_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = or <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @or_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: or_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; 
LMULMAX1-RV32-LABEL: or_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: or_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = or <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @or_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: or_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: or_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: or_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = or <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @or_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: or_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: or_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, 
e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: or_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = or <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @xor_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: xor_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vxor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: xor_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vxor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: xor_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vxor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = xor <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @xor_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: xor_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vxor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: xor_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; 
LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vxor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: xor_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vxor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = xor <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @xor_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: xor_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vxor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: xor_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vxor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: xor_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vxor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = xor <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @xor_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: xor_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vxor.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: xor_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; 
LMULMAX1-RV32-NEXT: vxor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: xor_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vxor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vxor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = xor <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @lshr_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: lshr_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vsrl.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: lshr_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: lshr_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = lshr <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @lshr_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: lshr_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vsrl.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: lshr_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; 
LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: lshr_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = lshr <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @lshr_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: lshr_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vsrl.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: lshr_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: lshr_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = lshr <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @lshr_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: lshr_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vsrl.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: lshr_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: lshr_v4i64: +; 
LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsrl.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = lshr <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @ashr_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: ashr_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vsra.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: ashr_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsra.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: ashr_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsra.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = ashr <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @ashr_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: ashr_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vsra.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: ashr_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsra.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: ashr_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; 
LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsra.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = ashr <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @ashr_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: ashr_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vsra.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: ashr_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsra.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: ashr_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsra.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = ashr <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @ashr_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: ashr_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vsra.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: ashr_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsra.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: ashr_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) 
+; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsra.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = ashr <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @shl_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: shl_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vsll.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: shl_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: shl_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsll.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = shl <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @shl_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: shl_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vsll.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: shl_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: shl_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; 
LMULMAX1-RV64-NEXT: vsll.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = shl <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @shl_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: shl_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vsll.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: shl_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: shl_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsll.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = shl <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @shl_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: shl_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vsll.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: shl_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: shl_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vsll.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) 
+; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = shl <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @sdiv_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: sdiv_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sdiv_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sdiv_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = sdiv <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @sdiv_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: sdiv_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sdiv_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sdiv_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, 
<16 x i16>* %y + %c = sdiv <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @sdiv_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: sdiv_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sdiv_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sdiv_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = sdiv <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @sdiv_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: sdiv_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vdiv.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: sdiv_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdiv.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: sdiv_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = sdiv <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void 
@srem_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: srem_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vrem.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: srem_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vrem.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: srem_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vrem.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = srem <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @srem_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: srem_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vrem.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: srem_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vrem.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: srem_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vrem.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = srem <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @srem_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: srem_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, 
zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vrem.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: srem_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vrem.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: srem_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vrem.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = srem <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @srem_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: srem_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vrem.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: srem_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vrem.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: srem_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vrem.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vrem.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = srem <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @udiv_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: udiv_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) 
+; LMULMAX2-NEXT: vdivu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: udiv_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: udiv_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = udiv <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @udiv_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: udiv_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vdivu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: udiv_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: udiv_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = udiv <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @udiv_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: udiv_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vdivu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; 
LMULMAX1-RV32-LABEL: udiv_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: udiv_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = udiv <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @udiv_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: udiv_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vdivu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: udiv_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vdivu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: udiv_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = udiv <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +} + +define void @urem_v32i8(<32 x i8>* %x, <32 x i8>* %y) { +; LMULMAX2-LABEL: urem_v32i8: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 32 +; LMULMAX2-NEXT: vsetvli a2, a2, e8,m2,ta,mu +; LMULMAX2-NEXT: vle8.v v26, (a0) +; LMULMAX2-NEXT: vle8.v v28, (a1) +; LMULMAX2-NEXT: vremu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse8.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: urem_v32i8: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 16 +; 
LMULMAX1-RV32-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle8.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV32-NEXT: vremu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: urem_v32i8: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 16 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e8,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle8.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle8.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle8.v v28, (a1) +; LMULMAX1-RV64-NEXT: vremu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse8.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse8.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <32 x i8>, <32 x i8>* %x + %b = load <32 x i8>, <32 x i8>* %y + %c = urem <32 x i8> %a, %b + store <32 x i8> %c, <32 x i8>* %x + ret void +} + +define void @urem_v16i16(<16 x i16>* %x, <16 x i16>* %y) { +; LMULMAX2-LABEL: urem_v16i16: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 16 +; LMULMAX2-NEXT: vsetvli a2, a2, e16,m2,ta,mu +; LMULMAX2-NEXT: vle16.v v26, (a0) +; LMULMAX2-NEXT: vle16.v v28, (a1) +; LMULMAX2-NEXT: vremu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse16.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: urem_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 8 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle16.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV32-NEXT: vremu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: urem_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 8 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle16.v v28, (a1) +; LMULMAX1-RV64-NEXT: vremu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <16 x i16>, <16 x i16>* %x + %b = load <16 x i16>, <16 x i16>* %y + %c = urem <16 x i16> %a, %b + store <16 x i16> %c, <16 x i16>* %x + ret void +} + +define void @urem_v8i32(<8 x i32>* %x, <8 x i32>* %y) { +; LMULMAX2-LABEL: urem_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 8 +; LMULMAX2-NEXT: vsetvli a2, a2, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: vremu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: urem_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 4 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: 
addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle32.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vremu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: urem_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 4 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV64-NEXT: vremu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <8 x i32>, <8 x i32>* %x + %b = load <8 x i32>, <8 x i32>* %y + %c = urem <8 x i32> %a, %b + store <8 x i32> %c, <8 x i32>* %x + ret void +} + +define void @urem_v4i64(<4 x i64>* %x, <4 x i64>* %y) { +; LMULMAX2-LABEL: urem_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a2, zero, 4 +; LMULMAX2-NEXT: vsetvli a2, a2, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v26, (a0) +; LMULMAX2-NEXT: vle64.v v28, (a1) +; LMULMAX2-NEXT: vremu.vv v26, v26, v28 +; LMULMAX2-NEXT: vse64.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-RV32-LABEL: urem_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi a2, zero, 2 +; LMULMAX1-RV32-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a2, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV32-NEXT: addi a3, a1, 16 +; LMULMAX1-RV32-NEXT: vle64.v v27, (a3) +; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV32-NEXT: vremu.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: urem_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi a2, zero, 2 +; LMULMAX1-RV64-NEXT: vsetvli a2, a2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a2, a1, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v27, (a2) +; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX1-RV64-NEXT: vremu.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vremu.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) +; LMULMAX1-RV64-NEXT: ret + %a = load <4 x i64>, <4 x i64>* %x + %b = load <4 x i64>, <4 x i64>* %y + %c = urem <4 x i64> %a, %b + store <4 x i64> %c, <4 x i64>* %x + ret void +}
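
The checks above all follow one shape: with LMULMAX2 a 256-bit operation is emitted as a single m2 instruction, while with LMULMAX1 it is split into two m1 halves at 16-byte offsets, each gated by a vsetvli whose AVL is the fixed element count. As a rough illustration of the lowering pattern these tests exercise, the following is a minimal sketch only, not the code from this patch: the lowerFixedBinOpSketch name, the ContainerVT parameter standing in for a fixed-to-scalable type mapping, and the use of INSERT_SUBVECTOR as the fixed-to-scalable cast are assumptions made for the example (the patch itself only documents EXTRACT_SUBVECTOR as the scalable-to-fixed cast and VL as the extra operand of the *_VL nodes).

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch only (hypothetical helper, not the patch's implementation): rewrite a
// fixed-length binary SDNode as a VL-predicated node (e.g. an ADD_VL-style
// opcode passed in VLOpc) on an assumed scalable "container" type, then cast
// the result back to the original fixed-length type.
static SDValue lowerFixedBinOpSketch(SDValue Op, SelectionDAG &DAG,
                                     unsigned VLOpc, MVT XLenVT,
                                     MVT ContainerVT /* assumed mapping */) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);

  // Place each fixed-length operand at element 0 of an undef scalable vector.
  SDValue Op0 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                            DAG.getUNDEF(ContainerVT), Op.getOperand(0),
                            ZeroIdx);
  SDValue Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
                            DAG.getUNDEF(ContainerVT), Op.getOperand(1),
                            ZeroIdx);

  // VL is the fixed element count, matching the AVL used by the vsetvli
  // instructions in the checks above.
  SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT);
  SDValue Res = DAG.getNode(VLOpc, DL, ContainerVT, Op0, Op1, VL);

  // EXTRACT_SUBVECTOR acts as the cast from scalable back to fixed.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, ZeroIdx);
}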