diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -145,7 +145,7 @@
     auto addRegClassForFixedVectors = [this](MVT VT) {
       unsigned LMul = Subtarget.getLMULForFixedLengthVector(VT);
       const TargetRegisterClass *RC;
-      if (LMul == 1)
+      if (LMul == 1 || VT.getVectorElementType() == MVT::i1)
        RC = &RISCV::VRRegClass;
      else if (LMul == 2)
        RC = &RISCV::VRM2RegClass;
@@ -4939,8 +4939,8 @@
   MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64;
 
   // Any return value split in to more than two values can't be returned
-  // directly.
-  if (IsRet && ValNo > 1)
+  // directly. Vectors are returned via the available vector registers.
+  if (!LocVT.isVector() && IsRet && ValNo > 1)
     return true;
 
   // UseGPRForF16_F32 if targeting one of the soft-float ABIs, if passing a
@@ -5031,9 +5031,15 @@
     return false;
   }
 
+  // Fixed-length vectors are located in the corresponding scalable-vector
+  // container types.
+  if (ValVT.isFixedLengthVector())
+    LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+
   // Split arguments might be passed indirectly, so keep track of the pending
-  // values.
-  if (ArgFlags.isSplit() || !PendingLocs.empty()) {
+  // values. Split vectors are passed via a mix of registers and indirectly, so
+  // treat them as we would any other argument.
+  if (!LocVT.isVector() && (ArgFlags.isSplit() || !PendingLocs.empty())) {
     LocVT = XLenVT;
     LocInfo = CCValAssign::Indirect;
     PendingLocs.push_back(
@@ -5046,7 +5052,7 @@
 
   // If the split argument only had two elements, it should be passed directly
   // in registers or on the stack.
-  if (ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
+  if (!LocVT.isVector() && ArgFlags.isSplitEnd() && PendingLocs.size() <= 2) {
     assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()");
     // Apply the normal calling convention rules to the first half of the
     // split argument.
@@ -5066,7 +5072,7 @@
     Reg = State.AllocateReg(ArgFPR32s);
   else if (ValVT == MVT::f64 && !UseGPRForF64)
     Reg = State.AllocateReg(ArgFPR64s);
-  else if (ValVT.isScalableVector()) {
+  else if (ValVT.isVector()) {
     const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
     if (RC == &RISCV::VRRegClass) {
       // Assign the first mask argument to V0.
@@ -5088,6 +5094,12 @@
       llvm_unreachable("Unhandled class register for ValueType");
     }
     if (!Reg) {
+      // For return values, the vector must be passed fully via registers or
+      // via the stack.
+      // FIXME: The proposed vector ABI only mandates v8-v15 for return values,
+      // but we're using all of them.
+      if (IsRet)
+        return true;
       LocInfo = CCValAssign::Indirect;
       // Try using a GPR to pass the address
       Reg = State.AllocateReg(ArgGPRs);
@@ -5117,8 +5129,8 @@
   }
 
   assert((!UseGPRForF16_F32 || !UseGPRForF64 || LocVT == XLenVT ||
-          (TLI.getSubtarget().hasStdExtV() && ValVT.isScalableVector())) &&
-         "Expected an XLenVT or scalable vector types at this stage");
+          (TLI.getSubtarget().hasStdExtV() && ValVT.isVector())) &&
+         "Expected an XLenVT or vector types at this stage");
 
   if (Reg) {
     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -5139,8 +5151,7 @@ static Optional<unsigned> preAssignMask(const ArgTy &Args) {
   for (const auto &ArgIdx : enumerate(Args)) {
     MVT ArgVT = ArgIdx.value().VT;
-    if (ArgVT.isScalableVector() &&
-        ArgVT.getVectorElementType().SimpleTy == MVT::i1)
+    if (ArgVT.isVector() && ArgVT.getVectorElementType() == MVT::i1)
       return ArgIdx.index();
   }
   return None;
 }
@@ -5206,11 +5217,14 @@
 // Convert Val to a ValVT. Should not be called for CCValAssign::Indirect
 // values.
 static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val,
-                                   const CCValAssign &VA, const SDLoc &DL) {
+                                   const CCValAssign &VA, const SDLoc &DL,
+                                   const RISCVSubtarget &Subtarget) {
   switch (VA.getLocInfo()) {
   default:
     llvm_unreachable("Unexpected CCValAssign::LocInfo");
   case CCValAssign::Full:
+    if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector())
+      Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget);
     break;
   case CCValAssign::BCvt:
     if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
@@ -5241,17 +5255,20 @@
   if (VA.getLocInfo() == CCValAssign::Indirect)
     return Val;
 
-  return convertLocVTToValVT(DAG, Val, VA, DL);
+  return convertLocVTToValVT(DAG, Val, VA, DL, TLI.getSubtarget());
 }
 
 static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val,
-                                   const CCValAssign &VA, const SDLoc &DL) {
+                                   const CCValAssign &VA, const SDLoc &DL,
+                                   const RISCVSubtarget &Subtarget) {
   EVT LocVT = VA.getLocVT();
 
   switch (VA.getLocInfo()) {
   default:
     llvm_unreachable("Unexpected CCValAssign::LocInfo");
   case CCValAssign::Full:
+    if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector())
+      Val = convertToScalableVector(LocVT, Val, DAG, Subtarget);
     break;
   case CCValAssign::BCvt:
     if (VA.getLocVT().isInteger() && VA.getValVT() == MVT::f16)
@@ -5512,14 +5529,17 @@
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       // If the original argument was split and passed by reference (e.g. i128
       // on RV32), we need to load all parts of it here (using the same
-      // address).
+      // address). Vectors may be partly split to registers and partly to the
+      // stack, in which case the base address is partly offset and subsequent
+      // stores are relative to that.
       InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
                                    MachinePointerInfo()));
       unsigned ArgIndex = Ins[i].OrigArgIndex;
-      assert(Ins[i].PartOffset == 0);
+      unsigned ArgPartOffset = Ins[i].PartOffset;
+      assert(VA.getValVT().isVector() || ArgPartOffset == 0);
       while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) {
         CCValAssign &PartVA = ArgLocs[i + 1];
-        unsigned PartOffset = Ins[i + 1].PartOffset;
+        unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset;
         SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
                                       DAG.getIntPtrConstant(PartOffset, DL));
         InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
@@ -5789,12 +5809,16 @@
           DAG.getStore(Chain, DL, ArgValue, SpillSlot,
                        MachinePointerInfo::getFixedStack(MF, FI)));
       // If the original argument was split (e.g. i128), we need
-      // to store all parts of it here (and pass just one address).
+      // to store the required parts of it here (and pass just one address).
+      // Vectors may be partly split to registers and partly to the stack, in
+      // which case the base address is partly offset and subsequent stores are
+      // relative to that.
unsigned ArgIndex = Outs[i].OrigArgIndex; - assert(Outs[i].PartOffset == 0); + unsigned ArgPartOffset = Outs[i].PartOffset; + assert(VA.getValVT().isVector() || ArgPartOffset == 0); while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { SDValue PartValue = OutVals[i + 1]; - unsigned PartOffset = Outs[i + 1].PartOffset; + unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset; SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, DAG.getIntPtrConstant(PartOffset, DL)); MemOpChains.push_back( @@ -5804,7 +5828,7 @@ } ArgValue = SpillSlot; } else { - ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); + ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL, Subtarget); } // Use local copy if it is a byval arg. @@ -5940,7 +5964,7 @@ RetValue2); } - RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL, Subtarget); InVals.push_back(RetValue); } @@ -6026,7 +6050,7 @@ RetOps.push_back(DAG.getRegister(RegHi, MVT::i32)); } else { // Handle a 'normal' return. - Val = convertValVTToLocVT(DAG, Val, VA, DL); + Val = convertValVTToLocVT(DAG, Val, VA, DL, Subtarget); Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); if (STI.isRegisterReservedByUser(VA.getLocReg())) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv.ll @@ -0,0 +1,1170 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=4 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX4 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 +; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 + +define <4 x i8> @ret_v4i8(<4 x i8>* %p) { +; CHECK-LABEL: ret_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e8,m1,ta,mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %p + ret <4 x i8> %v +} + +define <4 x i32> @ret_v4i32(<4 x i32>* %p) { +; CHECK-LABEL: ret_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %p + ret <4 x i32> %v +} + +define <8 x i32> @ret_v8i32(<8 x i32>* %p) { +; LMULMAX8-LABEL: ret_v8i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v8i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v8i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: ret +; +; 
LMULMAX1-LABEL: ret_v8i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, a0, 16 +; LMULMAX1-NEXT: vle32.v v9, (a0) +; LMULMAX1-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %p + ret <8 x i32> %v +} + +define <16 x i64> @ret_v16i64(<16 x i64>* %p) { +; LMULMAX8-LABEL: ret_v16i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a1, 16, e64,m8,ta,mu +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v16i64: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; LMULMAX4-NEXT: vle64.v v8, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle64.v v12, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v16i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-NEXT: vle64.v v8, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle64.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle64.v v12, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle64.v v14, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v16i64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vle64.v v8, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle64.v v9, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle64.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle64.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle64.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle64.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle64.v v14, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle64.v v15, (a0) +; LMULMAX1-NEXT: ret + %v = load <16 x i64>, <16 x i64>* %p + ret <16 x i64> %v +} + +define <8 x i1> @ret_mask_v8i1(<8 x i1>* %p) { +; CHECK-LABEL: ret_mask_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 8, e8,m1,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load <8 x i1>, <8 x i1>* %p + ret <8 x i1> %v +} + +define <32 x i1> @ret_mask_v32i1(<32 x i1>* %p) { +; LMULMAX8-LABEL: ret_mask_v32i1: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX8-NEXT: vle1.v v0, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_mask_v32i1: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a1, zero, 32 +; LMULMAX4-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX4-NEXT: vle1.v v0, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_mask_v32i1: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, zero, 32 +; LMULMAX2-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; LMULMAX2-NEXT: vle1.v v0, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_mask_v32i1: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: vle1.v v0, (a0) +; LMULMAX1-NEXT: addi a0, a0, 2 +; LMULMAX1-NEXT: vle1.v v8, (a0) +; LMULMAX1-NEXT: ret + %v = load <32 x i1>, <32 x i1>* %p + ret <32 x i1> %v +} + +; Return the vector via registers v8-v23 +define <64 x i32> @ret_split_v64i32(<64 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v64i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, a0, 128 +; LMULMAX8-NEXT: vle32.v v16, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v64i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: 
vle32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vle32.v v16, (a1) +; LMULMAX4-NEXT: addi a0, a0, 192 +; LMULMAX4-NEXT: vle32.v v20, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_split_v64i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v12, (a1) +; LMULMAX2-NEXT: addi a1, a0, 96 +; LMULMAX2-NEXT: vle32.v v14, (a1) +; LMULMAX2-NEXT: addi a1, a0, 128 +; LMULMAX2-NEXT: vle32.v v16, (a1) +; LMULMAX2-NEXT: addi a1, a0, 160 +; LMULMAX2-NEXT: vle32.v v18, (a1) +; LMULMAX2-NEXT: addi a1, a0, 192 +; LMULMAX2-NEXT: vle32.v v20, (a1) +; LMULMAX2-NEXT: addi a0, a0, 224 +; LMULMAX2-NEXT: vle32.v v22, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_split_v64i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v9, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v14, (a1) +; LMULMAX1-NEXT: addi a1, a0, 112 +; LMULMAX1-NEXT: vle32.v v15, (a1) +; LMULMAX1-NEXT: addi a1, a0, 128 +; LMULMAX1-NEXT: vle32.v v16, (a1) +; LMULMAX1-NEXT: addi a1, a0, 144 +; LMULMAX1-NEXT: vle32.v v17, (a1) +; LMULMAX1-NEXT: addi a1, a0, 160 +; LMULMAX1-NEXT: vle32.v v18, (a1) +; LMULMAX1-NEXT: addi a1, a0, 176 +; LMULMAX1-NEXT: vle32.v v19, (a1) +; LMULMAX1-NEXT: addi a1, a0, 192 +; LMULMAX1-NEXT: vle32.v v20, (a1) +; LMULMAX1-NEXT: addi a1, a0, 208 +; LMULMAX1-NEXT: vle32.v v21, (a1) +; LMULMAX1-NEXT: addi a1, a0, 224 +; LMULMAX1-NEXT: vle32.v v22, (a1) +; LMULMAX1-NEXT: addi a0, a0, 240 +; LMULMAX1-NEXT: vle32.v v23, (a0) +; LMULMAX1-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + ret <64 x i32> %v +} + +; Return the vector fully via the stack +define <128 x i32> @ret_split_v128i32(<128 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v128i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a1) +; LMULMAX8-NEXT: addi a2, a1, 128 +; LMULMAX8-NEXT: vle32.v v16, (a2) +; LMULMAX8-NEXT: addi a2, a1, 384 +; LMULMAX8-NEXT: vle32.v v24, (a2) +; LMULMAX8-NEXT: addi a1, a1, 256 +; LMULMAX8-NEXT: vle32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 384 +; LMULMAX8-NEXT: vse32.v v24, (a1) +; LMULMAX8-NEXT: addi a1, a0, 256 +; LMULMAX8-NEXT: vse32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 128 +; LMULMAX8-NEXT: vse32.v v16, (a1) +; LMULMAX8-NEXT: vse32.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v128i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a2, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: addi a2, a1, 64 +; LMULMAX4-NEXT: vle32.v v8, (a2) +; LMULMAX4-NEXT: addi a2, a1, 128 +; LMULMAX4-NEXT: vle32.v v12, (a2) +; LMULMAX4-NEXT: addi a2, a1, 192 +; LMULMAX4-NEXT: vle32.v v16, (a2) +; LMULMAX4-NEXT: addi a2, a1, 256 +; LMULMAX4-NEXT: vle32.v v20, (a2) +; LMULMAX4-NEXT: addi a2, a1, 320 +; LMULMAX4-NEXT: vle32.v v24, (a2) +; LMULMAX4-NEXT: addi a2, a1, 448 +; LMULMAX4-NEXT: vle32.v v0, (a2) +; LMULMAX4-NEXT: addi a1, a1, 384 +; LMULMAX4-NEXT: vle32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, 
a0, 448 +; LMULMAX4-NEXT: vse32.v v0, (a1) +; LMULMAX4-NEXT: addi a1, a0, 384 +; LMULMAX4-NEXT: vse32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, a0, 320 +; LMULMAX4-NEXT: vse32.v v24, (a1) +; LMULMAX4-NEXT: addi a1, a0, 256 +; LMULMAX4-NEXT: vse32.v v20, (a1) +; LMULMAX4-NEXT: addi a1, a0, 192 +; LMULMAX4-NEXT: vse32.v v16, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vse32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vse32.v v8, (a1) +; LMULMAX4-NEXT: vse32.v v28, (a0) +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_split_v128i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a1) +; LMULMAX2-NEXT: addi a2, a1, 32 +; LMULMAX2-NEXT: vle32.v v28, (a2) +; LMULMAX2-NEXT: addi a2, a1, 64 +; LMULMAX2-NEXT: vle32.v v30, (a2) +; LMULMAX2-NEXT: addi a2, a1, 96 +; LMULMAX2-NEXT: vle32.v v8, (a2) +; LMULMAX2-NEXT: addi a2, a1, 128 +; LMULMAX2-NEXT: vle32.v v10, (a2) +; LMULMAX2-NEXT: addi a2, a1, 160 +; LMULMAX2-NEXT: vle32.v v12, (a2) +; LMULMAX2-NEXT: addi a2, a1, 192 +; LMULMAX2-NEXT: vle32.v v14, (a2) +; LMULMAX2-NEXT: addi a2, a1, 224 +; LMULMAX2-NEXT: vle32.v v16, (a2) +; LMULMAX2-NEXT: addi a2, a1, 256 +; LMULMAX2-NEXT: vle32.v v18, (a2) +; LMULMAX2-NEXT: addi a2, a1, 288 +; LMULMAX2-NEXT: vle32.v v20, (a2) +; LMULMAX2-NEXT: addi a2, a1, 320 +; LMULMAX2-NEXT: vle32.v v22, (a2) +; LMULMAX2-NEXT: addi a2, a1, 352 +; LMULMAX2-NEXT: vle32.v v24, (a2) +; LMULMAX2-NEXT: addi a2, a1, 384 +; LMULMAX2-NEXT: vle32.v v0, (a2) +; LMULMAX2-NEXT: addi a2, a1, 416 +; LMULMAX2-NEXT: vle32.v v2, (a2) +; LMULMAX2-NEXT: addi a2, a1, 480 +; LMULMAX2-NEXT: vle32.v v4, (a2) +; LMULMAX2-NEXT: addi a1, a1, 448 +; LMULMAX2-NEXT: vle32.v v6, (a1) +; LMULMAX2-NEXT: addi a1, a0, 480 +; LMULMAX2-NEXT: vse32.v v4, (a1) +; LMULMAX2-NEXT: addi a1, a0, 448 +; LMULMAX2-NEXT: vse32.v v6, (a1) +; LMULMAX2-NEXT: addi a1, a0, 416 +; LMULMAX2-NEXT: vse32.v v2, (a1) +; LMULMAX2-NEXT: addi a1, a0, 384 +; LMULMAX2-NEXT: vse32.v v0, (a1) +; LMULMAX2-NEXT: addi a1, a0, 352 +; LMULMAX2-NEXT: vse32.v v24, (a1) +; LMULMAX2-NEXT: addi a1, a0, 320 +; LMULMAX2-NEXT: vse32.v v22, (a1) +; LMULMAX2-NEXT: addi a1, a0, 288 +; LMULMAX2-NEXT: vse32.v v20, (a1) +; LMULMAX2-NEXT: addi a1, a0, 256 +; LMULMAX2-NEXT: vse32.v v18, (a1) +; LMULMAX2-NEXT: addi a1, a0, 224 +; LMULMAX2-NEXT: vse32.v v16, (a1) +; LMULMAX2-NEXT: addi a1, a0, 192 +; LMULMAX2-NEXT: vse32.v v14, (a1) +; LMULMAX2-NEXT: addi a1, a0, 160 +; LMULMAX2-NEXT: vse32.v v12, (a1) +; LMULMAX2-NEXT: addi a1, a0, 128 +; LMULMAX2-NEXT: vse32.v v10, (a1) +; LMULMAX2-NEXT: addi a1, a0, 96 +; LMULMAX2-NEXT: vse32.v v8, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vse32.v v30, (a1) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vse32.v v28, (a1) +; LMULMAX2-NEXT: vse32.v v26, (a0) +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_split_v128i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a1) +; LMULMAX1-NEXT: addi a2, a1, 16 +; LMULMAX1-NEXT: vle32.v v26, (a2) +; LMULMAX1-NEXT: addi a2, a1, 32 +; LMULMAX1-NEXT: vle32.v v27, (a2) +; LMULMAX1-NEXT: addi a2, a1, 48 +; LMULMAX1-NEXT: vle32.v v28, (a2) +; LMULMAX1-NEXT: addi a2, a1, 64 +; LMULMAX1-NEXT: vle32.v v29, (a2) +; LMULMAX1-NEXT: addi a2, a1, 80 +; LMULMAX1-NEXT: vle32.v v30, (a2) +; LMULMAX1-NEXT: addi a2, a1, 96 +; LMULMAX1-NEXT: vle32.v v31, (a2) +; LMULMAX1-NEXT: addi a2, a1, 112 +; LMULMAX1-NEXT: vle32.v v8, (a2) +; LMULMAX1-NEXT: addi a2, a1, 128 +; LMULMAX1-NEXT: vle32.v v9, (a2) +; 
LMULMAX1-NEXT: addi a2, a1, 144 +; LMULMAX1-NEXT: vle32.v v10, (a2) +; LMULMAX1-NEXT: addi a2, a1, 160 +; LMULMAX1-NEXT: vle32.v v11, (a2) +; LMULMAX1-NEXT: addi a2, a1, 176 +; LMULMAX1-NEXT: vle32.v v12, (a2) +; LMULMAX1-NEXT: addi a2, a1, 192 +; LMULMAX1-NEXT: vle32.v v13, (a2) +; LMULMAX1-NEXT: addi a2, a1, 208 +; LMULMAX1-NEXT: vle32.v v14, (a2) +; LMULMAX1-NEXT: addi a2, a1, 224 +; LMULMAX1-NEXT: vle32.v v15, (a2) +; LMULMAX1-NEXT: addi a2, a1, 240 +; LMULMAX1-NEXT: vle32.v v16, (a2) +; LMULMAX1-NEXT: addi a2, a1, 256 +; LMULMAX1-NEXT: vle32.v v17, (a2) +; LMULMAX1-NEXT: addi a2, a1, 272 +; LMULMAX1-NEXT: vle32.v v18, (a2) +; LMULMAX1-NEXT: addi a2, a1, 288 +; LMULMAX1-NEXT: vle32.v v19, (a2) +; LMULMAX1-NEXT: addi a2, a1, 304 +; LMULMAX1-NEXT: vle32.v v20, (a2) +; LMULMAX1-NEXT: addi a2, a1, 320 +; LMULMAX1-NEXT: vle32.v v21, (a2) +; LMULMAX1-NEXT: addi a2, a1, 336 +; LMULMAX1-NEXT: vle32.v v22, (a2) +; LMULMAX1-NEXT: addi a2, a1, 352 +; LMULMAX1-NEXT: vle32.v v23, (a2) +; LMULMAX1-NEXT: addi a2, a1, 368 +; LMULMAX1-NEXT: vle32.v v24, (a2) +; LMULMAX1-NEXT: addi a2, a1, 384 +; LMULMAX1-NEXT: vle32.v v0, (a2) +; LMULMAX1-NEXT: addi a2, a1, 400 +; LMULMAX1-NEXT: vle32.v v1, (a2) +; LMULMAX1-NEXT: addi a2, a1, 416 +; LMULMAX1-NEXT: vle32.v v2, (a2) +; LMULMAX1-NEXT: addi a2, a1, 432 +; LMULMAX1-NEXT: vle32.v v3, (a2) +; LMULMAX1-NEXT: addi a2, a1, 448 +; LMULMAX1-NEXT: vle32.v v4, (a2) +; LMULMAX1-NEXT: addi a2, a1, 464 +; LMULMAX1-NEXT: vle32.v v5, (a2) +; LMULMAX1-NEXT: addi a2, a1, 496 +; LMULMAX1-NEXT: vle32.v v6, (a2) +; LMULMAX1-NEXT: addi a1, a1, 480 +; LMULMAX1-NEXT: vle32.v v7, (a1) +; LMULMAX1-NEXT: addi a1, a0, 496 +; LMULMAX1-NEXT: vse32.v v6, (a1) +; LMULMAX1-NEXT: addi a1, a0, 480 +; LMULMAX1-NEXT: vse32.v v7, (a1) +; LMULMAX1-NEXT: addi a1, a0, 464 +; LMULMAX1-NEXT: vse32.v v5, (a1) +; LMULMAX1-NEXT: addi a1, a0, 448 +; LMULMAX1-NEXT: vse32.v v4, (a1) +; LMULMAX1-NEXT: addi a1, a0, 432 +; LMULMAX1-NEXT: vse32.v v3, (a1) +; LMULMAX1-NEXT: addi a1, a0, 416 +; LMULMAX1-NEXT: vse32.v v2, (a1) +; LMULMAX1-NEXT: addi a1, a0, 400 +; LMULMAX1-NEXT: vse32.v v1, (a1) +; LMULMAX1-NEXT: addi a1, a0, 384 +; LMULMAX1-NEXT: vse32.v v0, (a1) +; LMULMAX1-NEXT: addi a1, a0, 368 +; LMULMAX1-NEXT: vse32.v v24, (a1) +; LMULMAX1-NEXT: addi a1, a0, 352 +; LMULMAX1-NEXT: vse32.v v23, (a1) +; LMULMAX1-NEXT: addi a1, a0, 336 +; LMULMAX1-NEXT: vse32.v v22, (a1) +; LMULMAX1-NEXT: addi a1, a0, 320 +; LMULMAX1-NEXT: vse32.v v21, (a1) +; LMULMAX1-NEXT: addi a1, a0, 304 +; LMULMAX1-NEXT: vse32.v v20, (a1) +; LMULMAX1-NEXT: addi a1, a0, 288 +; LMULMAX1-NEXT: vse32.v v19, (a1) +; LMULMAX1-NEXT: addi a1, a0, 272 +; LMULMAX1-NEXT: vse32.v v18, (a1) +; LMULMAX1-NEXT: addi a1, a0, 256 +; LMULMAX1-NEXT: vse32.v v17, (a1) +; LMULMAX1-NEXT: addi a1, a0, 240 +; LMULMAX1-NEXT: vse32.v v16, (a1) +; LMULMAX1-NEXT: addi a1, a0, 224 +; LMULMAX1-NEXT: vse32.v v15, (a1) +; LMULMAX1-NEXT: addi a1, a0, 208 +; LMULMAX1-NEXT: vse32.v v14, (a1) +; LMULMAX1-NEXT: addi a1, a0, 192 +; LMULMAX1-NEXT: vse32.v v13, (a1) +; LMULMAX1-NEXT: addi a1, a0, 176 +; LMULMAX1-NEXT: vse32.v v12, (a1) +; LMULMAX1-NEXT: addi a1, a0, 160 +; LMULMAX1-NEXT: vse32.v v11, (a1) +; LMULMAX1-NEXT: addi a1, a0, 144 +; LMULMAX1-NEXT: vse32.v v10, (a1) +; LMULMAX1-NEXT: addi a1, a0, 128 +; LMULMAX1-NEXT: vse32.v v9, (a1) +; LMULMAX1-NEXT: addi a1, a0, 112 +; LMULMAX1-NEXT: vse32.v v8, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vse32.v v31, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vse32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, 
a0, 64 +; LMULMAX1-NEXT: vse32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vse32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vse32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vse32.v v26, (a1) +; LMULMAX1-NEXT: vse32.v v25, (a0) +; LMULMAX1-NEXT: ret + %v = load <128 x i32>, <128 x i32>* %x + ret <128 x i32> %v +} + +define <4 x i8> @ret_v8i8_param_v4i8(<4 x i8> %v) { +; CHECK-LABEL: ret_v8i8_param_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vadd.vi v8, v8, 2 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, + ret <4 x i8> %r +} + +define <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) { +; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, %w + ret <4 x i8> %r +} + +define <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) { +; LMULMAX8-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX8-NEXT: vadd.vv v8, v8, v10 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX4-NEXT: vadd.vv v8, v8, v10 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; LMULMAX2-NEXT: vadd.vv v8, v8, v10 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v4i64_param_v4i64_v4i64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a0, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vadd.vv v8, v8, v10 +; LMULMAX1-NEXT: vadd.vv v9, v9, v11 +; LMULMAX1-NEXT: ret + %r = add <4 x i64> %v, %w + ret <4 x i64> %r +} + +define <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) { +; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 8, e8,m1,ta,mu +; CHECK-NEXT: vmxor.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = xor <8 x i1> %v, %w + ret <8 x i1> %r +} + +define <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) { +; LMULMAX8-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX8-NEXT: vmand.mm v0, v0, v8 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a0, zero, 32 +; LMULMAX4-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX4-NEXT: vmand.mm v0, v0, v8 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a0, zero, 32 +; LMULMAX2-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; LMULMAX2-NEXT: vmand.mm v0, v0, v8 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i1_param_v32i1_v32i1: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a0, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: vmand.mm v0, v0, v9 +; LMULMAX1-NEXT: vmand.mm v8, v8, v10 +; LMULMAX1-NEXT: ret + %r = and <32 x i1> %v, %w + ret <32 x i1> %r +} + +define <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: vadd.vv v8, v8, v24 +; LMULMAX8-NEXT: vadd.vx v8, v8, a1 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: 
ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: vadd.vv v8, v8, v16 +; LMULMAX4-NEXT: vadd.vv v12, v12, v20 +; LMULMAX4-NEXT: vadd.vv v28, v12, v28 +; LMULMAX4-NEXT: vadd.vv v8, v8, v24 +; LMULMAX4-NEXT: vadd.vx v8, v8, a2 +; LMULMAX4-NEXT: vadd.vx v12, v28, a2 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle32.v v24, (a0) +; LMULMAX2-NEXT: vadd.vv v8, v8, v16 +; LMULMAX2-NEXT: vadd.vv v10, v10, v18 +; LMULMAX2-NEXT: vadd.vv v12, v12, v20 +; LMULMAX2-NEXT: vadd.vv v14, v14, v22 +; LMULMAX2-NEXT: vadd.vv v14, v14, v24 +; LMULMAX2-NEXT: vadd.vv v30, v12, v30 +; LMULMAX2-NEXT: vadd.vv v28, v10, v28 +; LMULMAX2-NEXT: vadd.vv v26, v8, v26 +; LMULMAX2-NEXT: vadd.vx v8, v26, a4 +; LMULMAX2-NEXT: vadd.vx v10, v28, a4 +; LMULMAX2-NEXT: vadd.vx v12, v30, a4 +; LMULMAX2-NEXT: vadd.vx v14, v14, a4 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v31, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle32.v v24, (a0) +; LMULMAX1-NEXT: lw a0, 0(sp) +; LMULMAX1-NEXT: vadd.vv v8, v8, v16 +; LMULMAX1-NEXT: vadd.vv v9, v9, v17 +; LMULMAX1-NEXT: vadd.vv v10, v10, v18 +; LMULMAX1-NEXT: vadd.vv v11, v11, v19 +; LMULMAX1-NEXT: vadd.vv v12, v12, v20 +; LMULMAX1-NEXT: vadd.vv v13, v13, v21 +; LMULMAX1-NEXT: vadd.vv v14, v14, v22 +; LMULMAX1-NEXT: vadd.vv v15, v15, v23 +; LMULMAX1-NEXT: vadd.vv v15, v15, v24 +; LMULMAX1-NEXT: vadd.vv v31, v14, v31 +; LMULMAX1-NEXT: vadd.vv v30, v13, v30 +; LMULMAX1-NEXT: vadd.vv v29, v12, v29 +; LMULMAX1-NEXT: vadd.vv v28, v11, v28 +; LMULMAX1-NEXT: vadd.vv v27, v10, v27 +; LMULMAX1-NEXT: vadd.vv v26, v9, v26 +; LMULMAX1-NEXT: vadd.vv v25, v8, v25 +; LMULMAX1-NEXT: vadd.vx v8, v25, a0 +; LMULMAX1-NEXT: vadd.vx v9, v26, a0 +; LMULMAX1-NEXT: vadd.vx v10, v27, a0 +; LMULMAX1-NEXT: vadd.vx v11, v28, a0 +; LMULMAX1-NEXT: vadd.vx v12, v29, a0 +; LMULMAX1-NEXT: vadd.vx v13, v30, a0 +; LMULMAX1-NEXT: vadd.vx v14, v31, a0 +; LMULMAX1-NEXT: vadd.vx v15, v15, a0 +; LMULMAX1-NEXT: ret + %r = add <32 x i32> %x, %y + %s = add <32 x i32> %r, %z + %head = insertelement <32 x i32> undef, i32 %w, i32 0 + %splat = shufflevector <32 x i32> %head, <32 x i32> undef, <32 x i32> zeroinitializer + %t = add <32 x i32> %s, %splat + ret <32 x i32> %t +} + +declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32) +declare <32 x i32> @ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32) + +define <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; 
LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -16 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX8-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: vmv8r.v v24, v8 +; LMULMAX8-NEXT: addi a1, zero, 2 +; LMULMAX8-NEXT: vmv8r.v v8, v16 +; LMULMAX8-NEXT: vmv8r.v v16, v24 +; LMULMAX8-NEXT: call ext2@plt +; LMULMAX8-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 16 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -16 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX4-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: vmv4r.v v28, v12 +; LMULMAX4-NEXT: vmv4r.v v24, v8 +; LMULMAX4-NEXT: addi a1, zero, 2 +; LMULMAX4-NEXT: vmv4r.v v8, v16 +; LMULMAX4-NEXT: vmv4r.v v12, v20 +; LMULMAX4-NEXT: vmv4r.v v16, v24 +; LMULMAX4-NEXT: vmv4r.v v20, v28 +; LMULMAX4-NEXT: call ext2@plt +; LMULMAX4-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 16 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -16 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: vmv2r.v v26, v14 +; LMULMAX2-NEXT: vmv2r.v v28, v12 +; LMULMAX2-NEXT: vmv2r.v v30, v10 +; LMULMAX2-NEXT: vmv2r.v v24, v8 +; LMULMAX2-NEXT: addi a1, zero, 2 +; LMULMAX2-NEXT: vmv2r.v v8, v16 +; LMULMAX2-NEXT: vmv2r.v v10, v18 +; LMULMAX2-NEXT: vmv2r.v v12, v20 +; LMULMAX2-NEXT: vmv2r.v v14, v22 +; LMULMAX2-NEXT: vmv2r.v v16, v24 +; LMULMAX2-NEXT: vmv2r.v v18, v30 +; LMULMAX2-NEXT: vmv2r.v v20, v28 +; LMULMAX2-NEXT: vmv2r.v v22, v26 +; LMULMAX2-NEXT: call ext2@plt +; LMULMAX2-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 16 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -16 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: vmv1r.v v25, v15 +; LMULMAX1-NEXT: vmv1r.v v26, v14 +; LMULMAX1-NEXT: vmv1r.v v27, v13 +; LMULMAX1-NEXT: vmv1r.v v28, v12 +; LMULMAX1-NEXT: vmv1r.v v29, v11 +; LMULMAX1-NEXT: vmv1r.v v30, v10 +; LMULMAX1-NEXT: vmv1r.v v31, v9 +; LMULMAX1-NEXT: vmv1r.v v24, v8 +; LMULMAX1-NEXT: addi a1, zero, 2 +; LMULMAX1-NEXT: vmv1r.v v8, v16 +; LMULMAX1-NEXT: vmv1r.v v9, v17 +; LMULMAX1-NEXT: vmv1r.v v10, v18 +; LMULMAX1-NEXT: vmv1r.v v11, v19 +; LMULMAX1-NEXT: vmv1r.v v12, v20 +; LMULMAX1-NEXT: vmv1r.v v13, v21 +; LMULMAX1-NEXT: vmv1r.v v14, v22 +; LMULMAX1-NEXT: vmv1r.v v15, v23 +; LMULMAX1-NEXT: vmv1r.v v16, v24 +; LMULMAX1-NEXT: vmv1r.v v17, v31 +; LMULMAX1-NEXT: vmv1r.v v18, v30 +; LMULMAX1-NEXT: vmv1r.v v19, v29 +; LMULMAX1-NEXT: vmv1r.v v20, v28 +; LMULMAX1-NEXT: vmv1r.v v21, v27 +; LMULMAX1-NEXT: vmv1r.v v22, v26 +; LMULMAX1-NEXT: vmv1r.v v23, v25 +; LMULMAX1-NEXT: call ext2@plt +; LMULMAX1-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 16 +; LMULMAX1-NEXT: ret + %t = call <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2) + ret <32 x i32> %t +} + +define <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: 
.cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: addi a2, zero, 42 +; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: vmv8r.v v8, v24 +; LMULMAX8-NEXT: call ext3@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vse32.v v12, (a0) +; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: addi a3, zero, 42 +; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: vmv4r.v v8, v28 +; LMULMAX4-NEXT: vmv4r.v v12, v24 +; LMULMAX4-NEXT: call ext3@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -256 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: .cfi_offset s0, -16 +; LMULMAX2-NEXT: addi s0, sp, 256 +; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX2-NEXT: andi sp, sp, -128 +; LMULMAX2-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: addi a1, a0, 32 +; LMULMAX2-NEXT: vle32.v v28, (a1) +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: addi a0, a0, 96 +; LMULMAX2-NEXT: vle32.v v24, (a0) +; LMULMAX2-NEXT: addi a0, sp, 96 +; LMULMAX2-NEXT: vse32.v v14, (a0) +; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: vse32.v v12, (a0) +; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: vse32.v v10, (a0) +; LMULMAX2-NEXT: mv a0, sp +; LMULMAX2-NEXT: addi a5, zero, 42 +; LMULMAX2-NEXT: vse32.v v8, (sp) +; LMULMAX2-NEXT: vmv2r.v v8, v26 +; LMULMAX2-NEXT: vmv2r.v v10, v28 +; LMULMAX2-NEXT: vmv2r.v v12, v30 +; LMULMAX2-NEXT: vmv2r.v v14, v24 +; LMULMAX2-NEXT: call ext3@plt +; LMULMAX2-NEXT: addi sp, s0, -256 +; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 256 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -384 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 384 +; LMULMAX1-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; 
LMULMAX1-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: .cfi_offset s0, -16 +; LMULMAX1-NEXT: addi s0, sp, 384 +; LMULMAX1-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX1-NEXT: andi sp, sp, -128 +; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a0) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vle32.v v29, (a1) +; LMULMAX1-NEXT: addi a1, a0, 80 +; LMULMAX1-NEXT: vle32.v v30, (a1) +; LMULMAX1-NEXT: addi a1, a0, 96 +; LMULMAX1-NEXT: vle32.v v31, (a1) +; LMULMAX1-NEXT: addi a0, a0, 112 +; LMULMAX1-NEXT: vle32.v v24, (a0) +; LMULMAX1-NEXT: ld a1, 0(s0) +; LMULMAX1-NEXT: addi a0, sp, 240 +; LMULMAX1-NEXT: vse32.v v15, (a0) +; LMULMAX1-NEXT: addi a0, sp, 224 +; LMULMAX1-NEXT: vse32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, sp, 208 +; LMULMAX1-NEXT: vse32.v v13, (a0) +; LMULMAX1-NEXT: addi a0, sp, 192 +; LMULMAX1-NEXT: vse32.v v12, (a0) +; LMULMAX1-NEXT: addi a0, sp, 176 +; LMULMAX1-NEXT: vse32.v v11, (a0) +; LMULMAX1-NEXT: addi a0, sp, 160 +; LMULMAX1-NEXT: vse32.v v10, (a0) +; LMULMAX1-NEXT: addi a0, sp, 144 +; LMULMAX1-NEXT: vse32.v v9, (a0) +; LMULMAX1-NEXT: addi a0, sp, 128 +; LMULMAX1-NEXT: vse32.v v8, (a0) +; LMULMAX1-NEXT: addi a0, zero, 42 +; LMULMAX1-NEXT: sd a0, 8(sp) +; LMULMAX1-NEXT: addi a0, sp, 128 +; LMULMAX1-NEXT: sd a1, 0(sp) +; LMULMAX1-NEXT: vmv1r.v v8, v25 +; LMULMAX1-NEXT: vmv1r.v v9, v26 +; LMULMAX1-NEXT: vmv1r.v v10, v27 +; LMULMAX1-NEXT: vmv1r.v v11, v28 +; LMULMAX1-NEXT: vmv1r.v v12, v29 +; LMULMAX1-NEXT: vmv1r.v v13, v30 +; LMULMAX1-NEXT: vmv1r.v v14, v31 +; LMULMAX1-NEXT: vmv1r.v v15, v24 +; LMULMAX1-NEXT: call ext3@plt +; LMULMAX1-NEXT: addi sp, s0, -384 +; LMULMAX1-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 384 +; LMULMAX1-NEXT: ret + %t = call <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42) + ret <32 x i32> %t +} + +; Test various configurations of split vector types where the values are split +; across both registers and the stack. 
+; LMUL8: Ins: v8,v9,v10,v11,v12, v16m8 y[0:31], a0+0 z[0:31] +; LMUL4: Ins: v8,v9,v10,v11,v12, v16m4 y[0:15], v20m4 y[16:31], a0+0 z[0:15], +; a0+64 z[16:31] +; LMUL2: Ins: v8,v9,v10,v11,v12, v14m2 y[0:7], v16m2 y[8:15], v18m2 y[16:23], +; v20m2 y[24:31], v22m2 z[0:7], a1+0 z[8:15], a1+32 z[16:23], +; a1+64 z[24:31] +; LMUL1: Ins: v8,v9,v10,v11,v12, v13 y[0:3], v14 y[4:7], v15 y[8:11], +; v16 y[12:15], v17 y[16:19], v18 y[20:23], v19 y[24:27], +; v20 y[28:31], v21 z[0:3], v22 z[4:7], v23 z[8:11], +; a1+0 z[12:15], a1+16 z[16:19], a1+32 z[20:23], a1+48 z[24:27], +; a1+64 z[28:31] +define <32 x i32> @split_vector_args(<2 x i32>,<2 x i32>,<2 x i32>,<2 x i32>,<2 x i32>, <32 x i32> %y, <32 x i32> %z) { +; LMULMAX8-LABEL: split_vector_args: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v16, v8 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: split_vector_args: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vsetivli a2, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: vle32.v v12, (a1) +; LMULMAX4-NEXT: vadd.vv v8, v16, v28 +; LMULMAX4-NEXT: vadd.vv v12, v20, v12 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: split_vector_args: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi a1, a0, 64 +; LMULMAX2-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: addi a2, a0, 32 +; LMULMAX2-NEXT: vle32.v v26, (a0) +; LMULMAX2-NEXT: vle32.v v28, (a2) +; LMULMAX2-NEXT: vle32.v v30, (a1) +; LMULMAX2-NEXT: vadd.vv v8, v14, v22 +; LMULMAX2-NEXT: vadd.vv v10, v16, v26 +; LMULMAX2-NEXT: vadd.vv v12, v18, v28 +; LMULMAX2-NEXT: vadd.vv v14, v20, v30 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: split_vector_args: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi a1, a0, 64 +; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v25, (a1) +; LMULMAX1-NEXT: addi a1, a0, 48 +; LMULMAX1-NEXT: vle32.v v26, (a1) +; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: vle32.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle32.v v28, (a1) +; LMULMAX1-NEXT: vle32.v v29, (a0) +; LMULMAX1-NEXT: vadd.vv v8, v13, v21 +; LMULMAX1-NEXT: vadd.vv v9, v14, v22 +; LMULMAX1-NEXT: vadd.vv v10, v15, v23 +; LMULMAX1-NEXT: vadd.vv v11, v16, v29 +; LMULMAX1-NEXT: vadd.vv v12, v17, v28 +; LMULMAX1-NEXT: vadd.vv v13, v18, v27 +; LMULMAX1-NEXT: vadd.vv v14, v19, v26 +; LMULMAX1-NEXT: vadd.vv v15, v20, v25 +; LMULMAX1-NEXT: ret + %v0 = add <32 x i32> %y, %z + ret <32 x i32> %v0 +} + +define <32 x i32> @call_split_vector_args(<2 x i32>* %pa, <32 x i32>* %pb) { +; LMULMAX8-LABEL: call_split_vector_args: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v16, (a1) +; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: vse32.v v16, (sp) +; LMULMAX8-NEXT: vmv1r.v v9, v8 +; LMULMAX8-NEXT: vmv1r.v v10, v8 +; LMULMAX8-NEXT: vmv1r.v v11, v8 +; LMULMAX8-NEXT: vmv1r.v v12, v8 +; LMULMAX8-NEXT: call split_vector_args@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; 
LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: call_split_vector_args: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: vsetivli a0, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v16, (a1) +; LMULMAX4-NEXT: addi a0, a1, 64 +; LMULMAX4-NEXT: vle32.v v20, (a0) +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vse32.v v20, (a0) +; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: vse32.v v16, (sp) +; LMULMAX4-NEXT: vmv1r.v v9, v8 +; LMULMAX4-NEXT: vmv1r.v v10, v8 +; LMULMAX4-NEXT: vmv1r.v v11, v8 +; LMULMAX4-NEXT: vmv1r.v v12, v8 +; LMULMAX4-NEXT: call split_vector_args@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret +; +; LMULMAX2-LABEL: call_split_vector_args: +; LMULMAX2: # %bb.0: +; LMULMAX2-NEXT: addi sp, sp, -256 +; LMULMAX2-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX2-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX2-NEXT: .cfi_offset ra, -8 +; LMULMAX2-NEXT: .cfi_offset s0, -16 +; LMULMAX2-NEXT: addi s0, sp, 256 +; LMULMAX2-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX2-NEXT: andi sp, sp, -128 +; LMULMAX2-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX2-NEXT: vle32.v v8, (a0) +; LMULMAX2-NEXT: vsetivli a0, 8, e32,m2,ta,mu +; LMULMAX2-NEXT: vle32.v v14, (a1) +; LMULMAX2-NEXT: addi a0, a1, 32 +; LMULMAX2-NEXT: vle32.v v16, (a0) +; LMULMAX2-NEXT: addi a0, a1, 64 +; LMULMAX2-NEXT: vle32.v v18, (a0) +; LMULMAX2-NEXT: addi a0, a1, 96 +; LMULMAX2-NEXT: vle32.v v20, (a0) +; LMULMAX2-NEXT: addi a0, sp, 64 +; LMULMAX2-NEXT: vse32.v v20, (a0) +; LMULMAX2-NEXT: addi a0, sp, 32 +; LMULMAX2-NEXT: vse32.v v18, (a0) +; LMULMAX2-NEXT: mv a0, sp +; LMULMAX2-NEXT: vse32.v v16, (sp) +; LMULMAX2-NEXT: vmv1r.v v9, v8 +; LMULMAX2-NEXT: vmv1r.v v10, v8 +; LMULMAX2-NEXT: vmv1r.v v11, v8 +; LMULMAX2-NEXT: vmv1r.v v12, v8 +; LMULMAX2-NEXT: vmv2r.v v22, v14 +; LMULMAX2-NEXT: call split_vector_args@plt +; LMULMAX2-NEXT: addi sp, s0, -256 +; LMULMAX2-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX2-NEXT: addi sp, sp, 256 +; LMULMAX2-NEXT: ret +; +; LMULMAX1-LABEL: call_split_vector_args: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: addi sp, sp, -256 +; LMULMAX1-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX1-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX1-NEXT: .cfi_offset ra, -8 +; LMULMAX1-NEXT: .cfi_offset s0, -16 +; LMULMAX1-NEXT: addi s0, sp, 256 +; LMULMAX1-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX1-NEXT: andi sp, sp, -128 +; LMULMAX1-NEXT: vsetivli a2, 2, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v8, (a0) +; LMULMAX1-NEXT: vsetivli a0, 4, e32,m1,ta,mu +; LMULMAX1-NEXT: vle32.v v13, (a1) +; LMULMAX1-NEXT: addi a0, a1, 16 +; LMULMAX1-NEXT: vle32.v v14, (a0) +; LMULMAX1-NEXT: addi a0, a1, 32 +; LMULMAX1-NEXT: vle32.v v15, (a0) +; 
LMULMAX1-NEXT: addi a0, a1, 48 +; LMULMAX1-NEXT: vle32.v v16, (a0) +; LMULMAX1-NEXT: addi a0, a1, 64 +; LMULMAX1-NEXT: vle32.v v17, (a0) +; LMULMAX1-NEXT: addi a0, a1, 80 +; LMULMAX1-NEXT: vle32.v v18, (a0) +; LMULMAX1-NEXT: addi a0, a1, 96 +; LMULMAX1-NEXT: vle32.v v19, (a0) +; LMULMAX1-NEXT: addi a0, a1, 112 +; LMULMAX1-NEXT: vle32.v v20, (a0) +; LMULMAX1-NEXT: addi a0, sp, 64 +; LMULMAX1-NEXT: vse32.v v20, (a0) +; LMULMAX1-NEXT: addi a0, sp, 48 +; LMULMAX1-NEXT: vse32.v v19, (a0) +; LMULMAX1-NEXT: addi a0, sp, 32 +; LMULMAX1-NEXT: vse32.v v18, (a0) +; LMULMAX1-NEXT: addi a0, sp, 16 +; LMULMAX1-NEXT: vse32.v v17, (a0) +; LMULMAX1-NEXT: mv a0, sp +; LMULMAX1-NEXT: vse32.v v16, (sp) +; LMULMAX1-NEXT: vmv1r.v v9, v8 +; LMULMAX1-NEXT: vmv1r.v v10, v8 +; LMULMAX1-NEXT: vmv1r.v v11, v8 +; LMULMAX1-NEXT: vmv1r.v v12, v8 +; LMULMAX1-NEXT: vmv1r.v v21, v13 +; LMULMAX1-NEXT: vmv1r.v v22, v14 +; LMULMAX1-NEXT: vmv1r.v v23, v15 +; LMULMAX1-NEXT: call split_vector_args@plt +; LMULMAX1-NEXT: addi sp, s0, -256 +; LMULMAX1-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX1-NEXT: addi sp, sp, 256 +; LMULMAX1-NEXT: ret + %a = load <2 x i32>, <2 x i32>* %pa + %b = load <32 x i32>, <32 x i32>* %pb + %r = call <32 x i32> @split_vector_args(<2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <2 x i32> %a, <32 x i32> %b, <32 x i32> %b) + ret <32 x i32> %r +}