diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H

 #include "RISCV.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"

@@ -481,12 +482,24 @@
   bool shouldRemoveExtendFromGSIndex(EVT VT) const override;

 private:
+  /// RISCVCCAssignFn - This target-specific function extends the default
+  /// CCValAssign with additional information used to lower RISC-V calling
+  /// conventions.
+  typedef bool RISCVCCAssignFn(const DataLayout &DL, RISCVABI::ABI,
+                               unsigned ValNo, MVT ValVT, MVT LocVT,
+                               CCValAssign::LocInfo LocInfo,
+                               ISD::ArgFlagsTy ArgFlags, CCState &State,
+                               bool IsFixed, bool IsRet, Type *OrigTy,
+                               const RISCVTargetLowering &TLI,
+                               Optional<unsigned> FirstMaskArgument);
+
   void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
-                        const SmallVectorImpl<ISD::InputArg> &Ins,
-                        bool IsRet) const;
+                        const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
+                        RISCVCCAssignFn Fn) const;
   void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
                          const SmallVectorImpl<ISD::OutputArg> &Outs,
-                         bool IsRet, CallLoweringInfo *CLI) const;
+                         bool IsRet, CallLoweringInfo *CLI,
+                         RISCVCCAssignFn Fn) const;

   template <class NodeTy>
   SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20,7 +20,6 @@
 #include "RISCVTargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -6646,6 +6645,27 @@
   return false;
 }

+static unsigned allocateRVVReg(MVT ValVT, unsigned ValNo,
+                               Optional<unsigned> FirstMaskArgument,
+                               CCState &State, const RISCVTargetLowering &TLI) {
+  const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
+  if (RC == &RISCV::VRRegClass) {
+    // Assign the first mask argument to V0.
+    // This is an interim calling convention and it may be changed in the
+    // future.
+    if (FirstMaskArgument.hasValue() && ValNo == FirstMaskArgument.getValue())
+      return State.AllocateReg(RISCV::V0);
+    return State.AllocateReg(ArgVRs);
+  }
+  if (RC == &RISCV::VRM2RegClass)
+    return State.AllocateReg(ArgVRM2s);
+  if (RC == &RISCV::VRM4RegClass)
+    return State.AllocateReg(ArgVRM4s);
+  if (RC == &RISCV::VRM8RegClass)
+    return State.AllocateReg(ArgVRM8s);
+  llvm_unreachable("Unhandled register class for ValueType");
+}
+
 // Implements the RISC-V calling convention. Returns true upon failure.
 static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
                      MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo,
@@ -6794,26 +6814,7 @@
   else if (ValVT == MVT::f64 && !UseGPRForF64)
     Reg = State.AllocateReg(ArgFPR64s);
   else if (ValVT.isVector()) {
-    const TargetRegisterClass *RC = TLI.getRegClassFor(ValVT);
-    if (RC == &RISCV::VRRegClass) {
-      // Assign the first mask argument to V0.
-      // This is an interim calling convention and it may be changed in the
-      // future.
-      if (FirstMaskArgument.hasValue() &&
-          ValNo == FirstMaskArgument.getValue()) {
-        Reg = State.AllocateReg(RISCV::V0);
-      } else {
-        Reg = State.AllocateReg(ArgVRs);
-      }
-    } else if (RC == &RISCV::VRM2RegClass) {
-      Reg = State.AllocateReg(ArgVRM2s);
-    } else if (RC == &RISCV::VRM4RegClass) {
-      Reg = State.AllocateReg(ArgVRM4s);
-    } else if (RC == &RISCV::VRM8RegClass) {
-      Reg = State.AllocateReg(ArgVRM8s);
-    } else {
-      llvm_unreachable("Unhandled class register for ValueType");
-    }
+    Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI);
     if (!Reg) {
       // For return values, the vector must be passed fully via registers or
       // via the stack.
@@ -6892,7 +6893,8 @@

 void RISCVTargetLowering::analyzeInputArgs(
     MachineFunction &MF, CCState &CCInfo,
-    const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet) const {
+    const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
+    RISCVCCAssignFn Fn) const {
   unsigned NumArgs = Ins.size();
   FunctionType *FType = MF.getFunction().getFunctionType();

@@ -6911,9 +6913,9 @@
       ArgTy = FType->getParamType(Ins[i].getOrigArgIndex());

     RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
-    if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
-                 ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this,
-                 FirstMaskArgument)) {
+    if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
+           ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this,
+           FirstMaskArgument)) {
       LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
                         << EVT(ArgVT).getEVTString() << '\n');
       llvm_unreachable(nullptr);
@@ -6924,7 +6926,7 @@
 void RISCVTargetLowering::analyzeOutputArgs(
     MachineFunction &MF, CCState &CCInfo,
     const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsRet,
-    CallLoweringInfo *CLI) const {
+    CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const {
   unsigned NumArgs = Outs.size();

   Optional<unsigned> FirstMaskArgument;
@@ -6937,9 +6939,9 @@
     Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr;

     RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
-    if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
-                 ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this,
-                 FirstMaskArgument)) {
+    if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
+           ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this,
+           FirstMaskArgument)) {
       LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type "
                         << EVT(ArgVT).getEVTString() << "\n");
       llvm_unreachable(nullptr);
@@ -7084,16 +7086,21 @@

 // FastCC has less than 1% performance improvement for some particular
 // benchmark. But theoretically, it may has benenfit for some cases.
-static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
+static bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI,
+                            unsigned ValNo, MVT ValVT, MVT LocVT,
                             CCValAssign::LocInfo LocInfo,
-                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+                            ISD::ArgFlagsTy ArgFlags, CCState &State,
+                            bool IsFixed, bool IsRet, Type *OrigTy,
+                            const RISCVTargetLowering &TLI,
+                            Optional<unsigned> FirstMaskArgument) {
+
+  // X5 and X6 might be used for save-restore libcall.
+  static const MCPhysReg GPRList[] = {
+      RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
+      RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7,  RISCV::X28,
+      RISCV::X29, RISCV::X30, RISCV::X31};
   if (LocVT == MVT::i32 || LocVT == MVT::i64) {
-    // X5 and X6 might be used for save-restore libcall.
-    static const MCPhysReg GPRList[] = {
-        RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
-        RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7,  RISCV::X28,
-        RISCV::X29, RISCV::X30, RISCV::X31};
     if (unsigned Reg = State.AllocateReg(GPRList)) {
       State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
       return false;
@@ -7148,6 +7155,35 @@
     return false;
   }

+  if (LocVT.isVector()) {
+    if (unsigned Reg =
+            allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) {
+      // Fixed-length vectors are located in the corresponding scalable-vector
+      // container types.
+      if (ValVT.isFixedLengthVector())
+        LocVT = TLI.getContainerForFixedLengthVector(LocVT);
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    } else {
+      // Try and pass the address via a "fast" GPR.
+      if (unsigned GPRReg = State.AllocateReg(GPRList)) {
+        LocInfo = CCValAssign::Indirect;
+        LocVT = TLI.getSubtarget().getXLenVT();
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, GPRReg, LocVT, LocInfo));
+      } else if (ValVT.isFixedLengthVector()) {
+        auto StackAlign = Align(std::max(1ul, ValVT.getScalarSizeInBits() / 8));
+        unsigned StackOffset =
+            State.AllocateStack(ValVT.getStoreSize(), StackAlign);
+        State.addLoc(
+            CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
+      } else {
+        // Can't pass scalable vectors on the stack.
+        return true;
+      }
+    }
+
+    return false;
+  }
+
   return true; // CC didn't match.
 }

@@ -7240,12 +7276,12 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

-  if (CallConv == CallingConv::Fast)
-    CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
-  else if (CallConv == CallingConv::GHC)
+  if (CallConv == CallingConv::GHC)
     CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_GHC);
   else
-    analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
+    analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false,
+                     CallConv == CallingConv::Fast ? CC_RISCV_FastCC
+                                                   : CC_RISCV);

   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -7450,12 +7486,12 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

-  if (CallConv == CallingConv::Fast)
-    ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
-  else if (CallConv == CallingConv::GHC)
+  if (CallConv == CallingConv::GHC)
     ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_GHC);
   else
-    analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
+    analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI,
+                      CallConv == CallingConv::Fast ? CC_RISCV_FastCC
+                                                    : CC_RISCV);

   // Check if it's really possible to do a tail call.
   if (IsTailCall)
@@ -7697,7 +7733,7 @@
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext());
-  analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true);
+  analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_RISCV);

   // Copy all of the result registers out of their specified physreg.
for (auto &VA : RVLocs) { @@ -7765,7 +7801,7 @@ *DAG.getContext()); analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true, - nullptr); + nullptr, CC_RISCV); if (CallConv == CallingConv::GHC && !RVLocs.empty()) report_fatal_error("GHC functions return void only"); diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv-fastcc.ll @@ -0,0 +1,602 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m,+experimental-v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+experimental-v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,RV64 + +define fastcc @ret_nxv4i8(* %p) { +; CHECK-LABEL: ret_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8,mf2,ta,mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +define fastcc @ret_nxv4i32(* %p) { +; CHECK-LABEL: ret_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vl2re32.v v8, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +define fastcc @ret_nxv8i32(* %p) { +; CHECK-LABEL: ret_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vl4re32.v v8, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +define fastcc @ret_nxv16i64(* %p) { +; CHECK-LABEL: ret_nxv16i64: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re64.v v16, (a1) +; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +define fastcc @ret_mask_nxv8i1(* %p) { +; CHECK-LABEL: ret_mask_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8,m1,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +define fastcc @ret_mask_nxv32i1(* %p) { +; CHECK-LABEL: ret_mask_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8,m4,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load , * %p + ret %v +} + +; Return the vector via registers v8-v23 +define fastcc @ret_split_nxv64i32(* %x) { +; CHECK-LABEL: ret_split_nxv64i32: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 3 +; CHECK-NEXT: slli a3, a2, 6 +; CHECK-NEXT: add a4, a1, a3 +; CHECK-NEXT: vl8re32.v v8, (a4) +; CHECK-NEXT: slli a4, a2, 7 +; CHECK-NEXT: addi a5, zero, 192 +; CHECK-NEXT: mul a2, a2, a5 +; CHECK-NEXT: add a5, a1, a4 +; CHECK-NEXT: vl8re32.v v16, (a1) +; CHECK-NEXT: add a1, a1, a2 +; CHECK-NEXT: vl8re32.v v24, (a1) +; CHECK-NEXT: vl8re32.v v0, (a5) +; CHECK-NEXT: vs8r.v v16, (a0) +; CHECK-NEXT: add a1, a0, a2 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: add a1, a0, a4 +; CHECK-NEXT: vs8r.v v0, (a1) +; CHECK-NEXT: add a0, a0, a3 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: ret + %v = load , * %x + ret %v +} + +; Return the vector fully via the stack +define fastcc @ret_split_nxv128i32(* %x) { +; CHECK-LABEL: ret_split_nxv128i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 5 +; CHECK-NEXT: sub sp, sp, a2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 3 +; CHECK-NEXT: slli a6, a2, 6 +; CHECK-NEXT: add a4, a1, a6 +; CHECK-NEXT: vl8re32.v v8, (a4) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: addi a4, zero, 24 +; CHECK-NEXT: mul a3, a3, a4 +; CHECK-NEXT: add a3, sp, a3 +; 
CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: slli a7, a2, 7 +; CHECK-NEXT: add a5, a1, a7 +; CHECK-NEXT: vl8re32.v v8, (a5) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: addi a5, zero, 192 +; CHECK-NEXT: mul t1, a2, a5 +; CHECK-NEXT: add a3, a1, t1 +; CHECK-NEXT: vl8re32.v v8, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 3 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: slli t3, a2, 8 +; CHECK-NEXT: add a4, a1, t3 +; CHECK-NEXT: vl8re32.v v8, (a4) +; CHECK-NEXT: addi a3, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: addi a4, zero, 320 +; CHECK-NEXT: mul a4, a2, a4 +; CHECK-NEXT: add t0, a1, a4 +; CHECK-NEXT: addi a5, zero, 384 +; CHECK-NEXT: mul a5, a2, a5 +; CHECK-NEXT: add t2, a1, a5 +; CHECK-NEXT: addi a3, zero, 448 +; CHECK-NEXT: mul a2, a2, a3 +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vl8re32.v v8, (a1) +; CHECK-NEXT: vl8re32.v v0, (t0) +; CHECK-NEXT: vl8re32.v v16, (a3) +; CHECK-NEXT: vl8re32.v v24, (t2) +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: add a1, a0, a2 +; CHECK-NEXT: vs8r.v v16, (a1) +; CHECK-NEXT: add a1, a0, a5 +; CHECK-NEXT: vs8r.v v24, (a1) +; CHECK-NEXT: add a1, a0, a4 +; CHECK-NEXT: vs8r.v v0, (a1) +; CHECK-NEXT: add a1, a0, t3 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: add a1, a0, t1 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: add a1, a0, a7 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: add a2, sp, a2 +; CHECK-NEXT: addi a2, a2, 16 +; CHECK-NEXT: vl8re8.v v8, (a2) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a1) +; CHECK-NEXT: add a0, a0, a6 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: addi a2, zero, 24 +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = load , * %x + ret %v +} + +define fastcc @ret_nxv4i8_param_nxv4i8_nxv4i8( %v, %w) { +; CHECK-LABEL: ret_nxv4i8_param_nxv4i8_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = add %v, %w + ret %r +} + +define fastcc @ret_nxv4i64_param_nxv4i64_nxv4i64( %v, %w) { +; CHECK-LABEL: ret_nxv4i64_param_nxv4i64_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: ret + %r = add %v, %w + ret %r +} + +define fastcc @ret_nxv8i1_param_nxv8i1_nxv8i1( %v, %w) { +; CHECK-LABEL: ret_nxv8i1_param_nxv8i1_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8,m1,ta,mu +; CHECK-NEXT: vmxor.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = xor %v, %w + ret %r +} + +define fastcc @ret_nxv32i1_param_nxv32i1_nxv32i1( %v, %w) { +; CHECK-LABEL: ret_nxv32i1_param_nxv32i1_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8,m4,ta,mu +; CHECK-NEXT: 
vmand.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = and %v, %w + ret %r +} + +define fastcc @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32( %x, %y, %z, i32 %w) { +; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vl8re32.v v8, (a2) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: addi a1, a2, 64 +; CHECK-NEXT: vl8re32.v v0, (a0) +; CHECK-NEXT: addi a0, a0, 64 +; CHECK-NEXT: vl8re32.v v8, (a0) +; CHECK-NEXT: vl8re32.v v16, (a1) +; CHECK-NEXT: vsetvli a0, zero, e32,m8,ta,mu +; CHECK-NEXT: vadd.vv v24, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8re8.v v0, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v0, v8 +; CHECK-NEXT: vadd.vv v16, v8, v16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v8, v24, v8 +; CHECK-NEXT: vadd.vx v8, v8, a4 +; CHECK-NEXT: vadd.vx v16, v16, a4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %r = add %x, %y + %s = add %r, %z + %head = insertelement undef, i32 %w, i32 0 + %splat = shufflevector %head, undef, zeroinitializer + %t = add %s, %splat + ret %t +} + +declare @ext2(, , i32, i32) +declare @ext3(, , , i32, i32) + +define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_i32( %x, %y, i32 %w) { +; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: vl8re32.v v24, (a0) +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: vl8re32.v v0, (a0) +; RV32-NEXT: addi a0, sp, 80 +; RV32-NEXT: vs8r.v v16, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: addi a3, zero, 2 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) +; RV32-NEXT: vmv8r.v v8, v24 +; RV32-NEXT: vmv8r.v v16, v0 +; RV32-NEXT: call ext2@plt +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: vl8re32.v v24, (a0) +; RV64-NEXT: addi a0, a0, 64 +; RV64-NEXT: vl8re32.v v0, (a0) +; RV64-NEXT: addi a0, sp, 88 +; RV64-NEXT: vs8r.v v16, (a0) +; RV64-NEXT: addi a0, sp, 24 +; RV64-NEXT: addi a3, zero, 2 +; RV64-NEXT: addi a1, sp, 24 +; RV64-NEXT: vs8r.v v8, (a1) +; RV64-NEXT: vmv8r.v v8, v24 +; RV64-NEXT: vmv8r.v v16, v0 +; RV64-NEXT: call ext2@plt +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: ld ra, 24(sp) # 
8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret + %t = call fastcc @ext2( %y, %x, i32 %w, i32 2) + ret %t +} + +define fastcc @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32( %x, %y, %z, i32 %w) { +; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: addi a3, zero, 48 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: vl8re32.v v24, (a2) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, a2, 64 +; RV32-NEXT: vl8re32.v v24, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8re32.v v0, (a0) +; RV32-NEXT: addi a0, a0, 64 +; RV32-NEXT: vl8re32.v v24, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 80 +; RV32-NEXT: vs8r.v v16, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 80 +; RV32-NEXT: vs8r.v v24, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a2, a1, 16 +; RV32-NEXT: addi a5, zero, 42 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v0, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: call ext3@plt +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: addi a1, zero, 48 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: addi a3, zero, 48 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: vl8re32.v v24, (a2) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, a2, 64 +; RV64-NEXT: vl8re32.v v24, (a1) +; RV64-NEXT: addi a1, sp, 24 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: addi a1, a0, 64 +; RV64-NEXT: vl8re32.v v0, (a0) +; RV64-NEXT: vl8re32.v v24, (a1) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 88 +; RV64-NEXT: vs8r.v v16, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 88 +; RV64-NEXT: vs8r.v v24, (a0) +; RV64-NEXT: csrr a0, vlenb +; 
RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 24 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 24 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a2, a1, 24 +; RV64-NEXT: addi a5, zero, 42 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 5 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: vs8r.v v0, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 24 +; RV64-NEXT: vl8re8.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: addi a1, sp, 24 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: call ext3@plt +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: addi a1, zero, 48 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret + %t = call fastcc @ext3( %z, %y, %x, i32 %w, i32 42) + ret %t +} + +; A test case where the normal calling convention would pass directly via the +; stack, but with fastcc can pass indirectly with the extra GPR registers +; allowed. +define fastcc @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, %x, %y, %z, i32 %8) { +; CHECK-LABEL: vector_arg_indirect_stack: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, t4, 64 +; CHECK-NEXT: vl8re32.v v24, (t4) +; CHECK-NEXT: vl8re32.v v0, (a0) +; CHECK-NEXT: vsetvli a0, zero, e32,m8,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vadd.vv v16, v16, v0 +; CHECK-NEXT: ret + %s = add %x, %z + ret %s +} + +; Calling the function above. Ensure we pass the arguments correctly. 
+define fastcc @pass_vector_arg_indirect_stack( %x, %y, %z) { +; RV32-LABEL: pass_vector_arg_indirect_stack: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: sub sp, sp, a0 +; RV32-NEXT: addi a0, sp, 80 +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 80 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: addi a1, zero, 1 +; RV32-NEXT: addi a2, zero, 2 +; RV32-NEXT: addi a3, zero, 3 +; RV32-NEXT: addi a4, zero, 4 +; RV32-NEXT: addi a5, zero, 5 +; RV32-NEXT: addi a6, zero, 6 +; RV32-NEXT: addi a7, zero, 7 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi t2, a0, 16 +; RV32-NEXT: addi t4, sp, 16 +; RV32-NEXT: addi t6, zero, 8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: mv a0, zero +; RV32-NEXT: vmv8r.v v16, v8 +; RV32-NEXT: call vector_arg_indirect_stack@plt +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 5 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: ret +; +; RV64-LABEL: pass_vector_arg_indirect_stack: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: sub sp, sp, a0 +; RV64-NEXT: addi a0, sp, 88 +; RV64-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 88 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: addi a0, sp, 24 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: addi a1, zero, 1 +; RV64-NEXT: addi a2, zero, 2 +; RV64-NEXT: addi a3, zero, 3 +; RV64-NEXT: addi a4, zero, 4 +; RV64-NEXT: addi a5, zero, 5 +; RV64-NEXT: addi a6, zero, 6 +; RV64-NEXT: addi a7, zero, 7 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi t2, a0, 24 +; RV64-NEXT: addi t4, sp, 24 +; RV64-NEXT: addi t6, zero, 8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 24 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: mv a0, zero +; RV64-NEXT: vmv8r.v v16, v8 +; RV64-NEXT: call vector_arg_indirect_stack@plt +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 5 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: ret + %s = call fastcc @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, zeroinitializer, zeroinitializer, zeroinitializer, i32 8) + ret %s +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-calling-conv-fastcc.ll @@ -0,0 +1,557 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
-mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=8 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX8 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=4 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX4 + +define fastcc <4 x i8> @ret_v4i8(<4 x i8>* %p) { +; CHECK-LABEL: ret_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e8,mf4,ta,mu +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i8>, <4 x i8>* %p + ret <4 x i8> %v +} + +define fastcc <4 x i32> @ret_v4i32(<4 x i32>* %p) { +; CHECK-LABEL: ret_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %v = load <4 x i32>, <4 x i32>* %p + ret <4 x i32> %v +} + +define fastcc <8 x i32> @ret_v8i32(<8 x i32>* %p) { +; CHECK-LABEL: ret_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: ret + %v = load <8 x i32>, <8 x i32>* %p + ret <8 x i32> %v +} + +define fastcc <16 x i64> @ret_v16i64(<16 x i64>* %p) { +; LMULMAX8-LABEL: ret_v16i64: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: vsetivli a1, 16, e64,m8,ta,mu +; LMULMAX8-NEXT: vle64.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v16i64: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 8, e64,m4,ta,mu +; LMULMAX4-NEXT: vle64.v v8, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle64.v v12, (a0) +; LMULMAX4-NEXT: ret + %v = load <16 x i64>, <16 x i64>* %p + ret <16 x i64> %v +} + +define fastcc <8 x i1> @ret_mask_v8i1(<8 x i1>* %p) { +; CHECK-LABEL: ret_mask_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a1, 8, e8,mf2,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load <8 x i1>, <8 x i1>* %p + ret <8 x i1> %v +} + +define fastcc <32 x i1> @ret_mask_v32i1(<32 x i1>* %p) { +; CHECK-LABEL: ret_mask_v32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a1, zero, 32 +; CHECK-NEXT: vsetvli a1, a1, e8,m2,ta,mu +; CHECK-NEXT: vle1.v v0, (a0) +; CHECK-NEXT: ret + %v = load <32 x i1>, <32 x i1>* %p + ret <32 x i1> %v +} + +; Return the vector via registers v8-v23 +define fastcc <64 x i32> @ret_split_v64i32(<64 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v64i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a1, zero, 32 +; LMULMAX8-NEXT: vsetvli a1, a1, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, a0, 128 +; LMULMAX8-NEXT: vle32.v v16, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v64i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v8, (a0) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vle32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vle32.v v16, (a1) +; LMULMAX4-NEXT: addi a0, a0, 192 +; LMULMAX4-NEXT: vle32.v v20, (a0) +; LMULMAX4-NEXT: ret + %v = load <64 x i32>, <64 x i32>* %x + ret <64 x i32> %v +} + +; Return the vector fully via the stack +define fastcc <128 x i32> @ret_split_v128i32(<128 x i32>* %x) { +; LMULMAX8-LABEL: ret_split_v128i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v8, (a1) +; LMULMAX8-NEXT: addi a2, a1, 128 +; LMULMAX8-NEXT: vle32.v v16, (a2) +; LMULMAX8-NEXT: addi a2, a1, 384 +; LMULMAX8-NEXT: vle32.v v24, (a2) +; LMULMAX8-NEXT: addi a1, a1, 256 +; LMULMAX8-NEXT: vle32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 384 +; 
LMULMAX8-NEXT: vse32.v v24, (a1) +; LMULMAX8-NEXT: addi a1, a0, 256 +; LMULMAX8-NEXT: vse32.v v0, (a1) +; LMULMAX8-NEXT: addi a1, a0, 128 +; LMULMAX8-NEXT: vse32.v v16, (a1) +; LMULMAX8-NEXT: vse32.v v8, (a0) +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_split_v128i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a2, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: addi a2, a1, 64 +; LMULMAX4-NEXT: vle32.v v8, (a2) +; LMULMAX4-NEXT: addi a2, a1, 128 +; LMULMAX4-NEXT: vle32.v v12, (a2) +; LMULMAX4-NEXT: addi a2, a1, 192 +; LMULMAX4-NEXT: vle32.v v16, (a2) +; LMULMAX4-NEXT: addi a2, a1, 256 +; LMULMAX4-NEXT: vle32.v v20, (a2) +; LMULMAX4-NEXT: addi a2, a1, 320 +; LMULMAX4-NEXT: vle32.v v24, (a2) +; LMULMAX4-NEXT: addi a2, a1, 448 +; LMULMAX4-NEXT: vle32.v v0, (a2) +; LMULMAX4-NEXT: addi a1, a1, 384 +; LMULMAX4-NEXT: vle32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, a0, 448 +; LMULMAX4-NEXT: vse32.v v0, (a1) +; LMULMAX4-NEXT: addi a1, a0, 384 +; LMULMAX4-NEXT: vse32.v v4, (a1) +; LMULMAX4-NEXT: addi a1, a0, 320 +; LMULMAX4-NEXT: vse32.v v24, (a1) +; LMULMAX4-NEXT: addi a1, a0, 256 +; LMULMAX4-NEXT: vse32.v v20, (a1) +; LMULMAX4-NEXT: addi a1, a0, 192 +; LMULMAX4-NEXT: vse32.v v16, (a1) +; LMULMAX4-NEXT: addi a1, a0, 128 +; LMULMAX4-NEXT: vse32.v v12, (a1) +; LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vse32.v v8, (a1) +; LMULMAX4-NEXT: vse32.v v28, (a0) +; LMULMAX4-NEXT: ret + %v = load <128 x i32>, <128 x i32>* %x + ret <128 x i32> %v +} + +define fastcc <4 x i8> @ret_v8i8_param_v4i8(<4 x i8> %v) { +; CHECK-LABEL: ret_v8i8_param_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,mf4,ta,mu +; CHECK-NEXT: vadd.vi v8, v8, 2 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, + ret <4 x i8> %r +} + +define fastcc <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) { +; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,mf4,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %r = add <4 x i8> %v, %w + ret <4 x i8> %r +} + +define fastcc <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) { +; CHECK-LABEL: ret_v4i64_param_v4i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e64,m2,ta,mu +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: ret + %r = add <4 x i64> %v, %w + ret <4 x i64> %r +} + +define fastcc <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) { +; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 8, e8,mf2,ta,mu +; CHECK-NEXT: vmxor.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = xor <8 x i1> %v, %w + ret <8 x i1> %r +} + +define fastcc <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) { +; CHECK-LABEL: ret_v32i1_param_v32i1_v32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, zero, 32 +; CHECK-NEXT: vsetvli a0, a0, e8,m2,ta,mu +; CHECK-NEXT: vmand.mm v0, v0, v8 +; CHECK-NEXT: ret + %r = and <32 x i1> %v, %w + ret <32 x i1> %r +} + +define fastcc <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: vadd.vv v8, v8, v24 +; LMULMAX8-NEXT: vadd.vx v8, v8, a1 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; 
LMULMAX4-NEXT: addi a1, a0, 64 +; LMULMAX4-NEXT: vle32.v v28, (a1) +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: vadd.vv v8, v8, v16 +; LMULMAX4-NEXT: vadd.vv v12, v12, v20 +; LMULMAX4-NEXT: vadd.vv v28, v12, v28 +; LMULMAX4-NEXT: vadd.vv v8, v8, v24 +; LMULMAX4-NEXT: vadd.vx v8, v8, a2 +; LMULMAX4-NEXT: vadd.vx v12, v28, a2 +; LMULMAX4-NEXT: ret + %r = add <32 x i32> %x, %y + %s = add <32 x i32> %r, %z + %head = insertelement <32 x i32> undef, i32 %w, i32 0 + %splat = shufflevector <32 x i32> %head, <32 x i32> undef, <32 x i32> zeroinitializer + %t = add <32 x i32> %s, %splat + ret <32 x i32> %t +} + +declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32) +declare <32 x i32> @ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32) + +define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -16 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX8-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: vmv8r.v v24, v8 +; LMULMAX8-NEXT: addi a1, zero, 2 +; LMULMAX8-NEXT: vmv8r.v v8, v16 +; LMULMAX8-NEXT: vmv8r.v v16, v24 +; LMULMAX8-NEXT: call ext2@plt +; LMULMAX8-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 16 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -16 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX4-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: vmv4r.v v28, v12 +; LMULMAX4-NEXT: vmv4r.v v24, v8 +; LMULMAX4-NEXT: addi a1, zero, 2 +; LMULMAX4-NEXT: vmv4r.v v8, v16 +; LMULMAX4-NEXT: vmv4r.v v12, v20 +; LMULMAX4-NEXT: vmv4r.v v16, v24 +; LMULMAX4-NEXT: vmv4r.v v20, v28 +; LMULMAX4-NEXT: call ext2@plt +; LMULMAX4-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 16 +; LMULMAX4-NEXT: ret + %t = call fastcc <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2) + ret <32 x i32> %t +} + +define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) { +; LMULMAX8-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: addi a2, zero, 32 +; LMULMAX8-NEXT: vsetvli a2, a2, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: mv a0, sp +; LMULMAX8-NEXT: addi a2, zero, 42 +; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: vmv8r.v v8, v24 +; LMULMAX8-NEXT: call ext3@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, 
sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: addi a0, a0, 64 +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vse32.v v12, (a0) +; LMULMAX4-NEXT: mv a0, sp +; LMULMAX4-NEXT: addi a3, zero, 42 +; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: vmv4r.v v8, v28 +; LMULMAX4-NEXT: vmv4r.v v12, v24 +; LMULMAX4-NEXT: call ext3@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret + %t = call fastcc <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42) + ret <32 x i32> %t +} + +; A test case where the normal calling convention would pass directly via the +; stack, but with fastcc can pass indirectly with the extra GPR registers +; allowed. +define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) { +; LMULMAX8-LABEL: vector_arg_indirect_stack: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: vle32.v v16, (t2) +; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: vector_arg_indirect_stack: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi a0, t2, 64 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vle32.v v28, (t2) +; LMULMAX4-NEXT: vle32.v v16, (a0) +; LMULMAX4-NEXT: vadd.vv v8, v8, v28 +; LMULMAX4-NEXT: vadd.vv v12, v12, v16 +; LMULMAX4-NEXT: ret + %s = add <32 x i32> %x, %z + ret <32 x i32> %s +} + +; Calling the function above. Ensure we pass the arguments correctly. 
+define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) { +; LMULMAX8-LABEL: pass_vector_arg_indirect_stack: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -256 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX8-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: .cfi_offset s0, -16 +; LMULMAX8-NEXT: addi s0, sp, 256 +; LMULMAX8-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX8-NEXT: andi sp, sp, -128 +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: vmv.v.i v8, 0 +; LMULMAX8-NEXT: addi a1, zero, 1 +; LMULMAX8-NEXT: addi a2, zero, 2 +; LMULMAX8-NEXT: addi a3, zero, 3 +; LMULMAX8-NEXT: addi a4, zero, 4 +; LMULMAX8-NEXT: addi a5, zero, 5 +; LMULMAX8-NEXT: addi a6, zero, 6 +; LMULMAX8-NEXT: addi a7, zero, 7 +; LMULMAX8-NEXT: mv t2, sp +; LMULMAX8-NEXT: addi t3, zero, 8 +; LMULMAX8-NEXT: vse32.v v8, (sp) +; LMULMAX8-NEXT: mv a0, zero +; LMULMAX8-NEXT: vmv8r.v v16, v8 +; LMULMAX8-NEXT: call vector_arg_indirect_stack@plt +; LMULMAX8-NEXT: addi sp, s0, -256 +; LMULMAX8-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 256 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: pass_vector_arg_indirect_stack: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -256 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 256 +; LMULMAX4-NEXT: sd ra, 248(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: .cfi_offset s0, -16 +; LMULMAX4-NEXT: addi s0, sp, 256 +; LMULMAX4-NEXT: .cfi_def_cfa s0, 0 +; LMULMAX4-NEXT: andi sp, sp, -128 +; LMULMAX4-NEXT: addi a0, sp, 64 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 +; LMULMAX4-NEXT: vse32.v v8, (a0) +; LMULMAX4-NEXT: addi a1, zero, 1 +; LMULMAX4-NEXT: addi a2, zero, 2 +; LMULMAX4-NEXT: addi a3, zero, 3 +; LMULMAX4-NEXT: addi a4, zero, 4 +; LMULMAX4-NEXT: addi a5, zero, 5 +; LMULMAX4-NEXT: addi a6, zero, 6 +; LMULMAX4-NEXT: addi a7, zero, 7 +; LMULMAX4-NEXT: mv t2, sp +; LMULMAX4-NEXT: addi t4, zero, 8 +; LMULMAX4-NEXT: vse32.v v8, (sp) +; LMULMAX4-NEXT: mv a0, zero +; LMULMAX4-NEXT: vmv4r.v v12, v8 +; LMULMAX4-NEXT: vmv4r.v v16, v8 +; LMULMAX4-NEXT: vmv4r.v v20, v8 +; LMULMAX4-NEXT: call vector_arg_indirect_stack@plt +; LMULMAX4-NEXT: addi sp, s0, -256 +; LMULMAX4-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: ld ra, 248(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 256 +; LMULMAX4-NEXT: ret + %s = call fastcc <32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8) + ret <32 x i32> %s +} + +; A pathological test case where even with fastcc we must use the stack for arguments %13 and %z +define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %last) { +; LMULMAX8-LABEL: vector_arg_direct_stack: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: addi a0, sp, 8 +; LMULMAX8-NEXT: vle32.v v24, (a0) +; LMULMAX8-NEXT: vadd.vv v8, v8, v16 +; LMULMAX8-NEXT: vadd.vv v8, v8, v24 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: 
vector_arg_direct_stack: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: vsetivli a0, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: addi a0, sp, 8 +; LMULMAX4-NEXT: vle32.v v28, (a0) +; LMULMAX4-NEXT: addi a0, sp, 72 +; LMULMAX4-NEXT: vle32.v v24, (a0) +; LMULMAX4-NEXT: vadd.vv v12, v12, v20 +; LMULMAX4-NEXT: vadd.vv v8, v8, v16 +; LMULMAX4-NEXT: vadd.vv v8, v8, v28 +; LMULMAX4-NEXT: vadd.vv v12, v12, v24 +; LMULMAX4-NEXT: ret + %s = add <32 x i32> %x, %y + %t = add <32 x i32> %s, %z + ret <32 x i32> %t +} + +; Calling the function above. Ensure we pass the arguments correctly. +define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) { +; LMULMAX8-LABEL: pass_vector_arg_direct_stack: +; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: addi sp, sp, -160 +; LMULMAX8-NEXT: .cfi_def_cfa_offset 160 +; LMULMAX8-NEXT: sd ra, 152(sp) # 8-byte Folded Spill +; LMULMAX8-NEXT: .cfi_offset ra, -8 +; LMULMAX8-NEXT: addi a0, zero, 32 +; LMULMAX8-NEXT: vsetvli a0, a0, e32,m8,ta,mu +; LMULMAX8-NEXT: vmv.v.i v8, 0 +; LMULMAX8-NEXT: addi a0, sp, 8 +; LMULMAX8-NEXT: vse32.v v8, (a0) +; LMULMAX8-NEXT: addi a0, zero, 1 +; LMULMAX8-NEXT: sd a0, 136(sp) +; LMULMAX8-NEXT: addi a0, zero, 13 +; LMULMAX8-NEXT: addi a1, zero, 1 +; LMULMAX8-NEXT: addi a2, zero, 2 +; LMULMAX8-NEXT: addi a3, zero, 3 +; LMULMAX8-NEXT: addi a4, zero, 4 +; LMULMAX8-NEXT: addi a5, zero, 5 +; LMULMAX8-NEXT: addi a6, zero, 6 +; LMULMAX8-NEXT: addi a7, zero, 7 +; LMULMAX8-NEXT: addi t2, zero, 8 +; LMULMAX8-NEXT: addi t3, zero, 9 +; LMULMAX8-NEXT: addi t4, zero, 10 +; LMULMAX8-NEXT: addi t5, zero, 11 +; LMULMAX8-NEXT: addi t6, zero, 12 +; LMULMAX8-NEXT: sd a0, 0(sp) +; LMULMAX8-NEXT: mv a0, zero +; LMULMAX8-NEXT: vmv8r.v v16, v8 +; LMULMAX8-NEXT: call vector_arg_direct_stack@plt +; LMULMAX8-NEXT: ld ra, 152(sp) # 8-byte Folded Reload +; LMULMAX8-NEXT: addi sp, sp, 160 +; LMULMAX8-NEXT: ret +; +; LMULMAX4-LABEL: pass_vector_arg_direct_stack: +; LMULMAX4: # %bb.0: +; LMULMAX4-NEXT: addi sp, sp, -160 +; LMULMAX4-NEXT: .cfi_def_cfa_offset 160 +; LMULMAX4-NEXT: sd ra, 152(sp) # 8-byte Folded Spill +; LMULMAX4-NEXT: .cfi_offset ra, -8 +; LMULMAX4-NEXT: addi a0, zero, 1 +; LMULMAX4-NEXT: sd a0, 136(sp) +; LMULMAX4-NEXT: addi a0, zero, 13 +; LMULMAX4-NEXT: sd a0, 0(sp) +; LMULMAX4-NEXT: addi a0, sp, 72 +; LMULMAX4-NEXT: vsetivli a1, 16, e32,m4,ta,mu +; LMULMAX4-NEXT: vmv.v.i v8, 0 +; LMULMAX4-NEXT: vse32.v v8, (a0) +; LMULMAX4-NEXT: addi a0, sp, 8 +; LMULMAX4-NEXT: addi a1, zero, 1 +; LMULMAX4-NEXT: addi a2, zero, 2 +; LMULMAX4-NEXT: addi a3, zero, 3 +; LMULMAX4-NEXT: addi a4, zero, 4 +; LMULMAX4-NEXT: addi a5, zero, 5 +; LMULMAX4-NEXT: addi a6, zero, 6 +; LMULMAX4-NEXT: addi a7, zero, 7 +; LMULMAX4-NEXT: addi t2, zero, 8 +; LMULMAX4-NEXT: addi t3, zero, 9 +; LMULMAX4-NEXT: addi t4, zero, 10 +; LMULMAX4-NEXT: addi t5, zero, 11 +; LMULMAX4-NEXT: addi t6, zero, 12 +; LMULMAX4-NEXT: vse32.v v8, (a0) +; LMULMAX4-NEXT: mv a0, zero +; LMULMAX4-NEXT: vmv4r.v v12, v8 +; LMULMAX4-NEXT: vmv4r.v v16, v8 +; LMULMAX4-NEXT: vmv4r.v v20, v8 +; LMULMAX4-NEXT: call vector_arg_direct_stack@plt +; LMULMAX4-NEXT: ld ra, 152(sp) # 8-byte Folded Reload +; LMULMAX4-NEXT: addi sp, sp, 160 +; LMULMAX4-NEXT: ret + %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1) + ret <32 x i32> %s +} + +; A pathological test case where even with fastcc we must use the stack for +; mask 
argument %m2. %m1 is passed via v0. +define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) { +; CHECK-LABEL: vector_mask_arg_direct_stack: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 4, e8,mf4,ta,mu +; CHECK-NEXT: addi a0, sp, 136 +; CHECK-NEXT: vle1.v v25, (a0) +; CHECK-NEXT: vmxor.mm v0, v0, v25 +; CHECK-NEXT: ret + %r = xor <4 x i1> %m1, %m2 + ret <4 x i1> %r +}
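
One way to read the tests above: the new lowering is opted into purely at the IR level by marking RVV-typed functions and their call sites fastcc. Below is a minimal illustrative sketch, not part of the patch or its autogenerated tests; the function names are hypothetical, and it assumes the same riscv64 +experimental-v setup as the RUN lines above.

; Hypothetical sketch: declaring both the callee and the call site fastcc lets
; RVV values travel in vector register groups starting at v8; when a scalable
; argument no longer fits in v8-v23, the patch passes its address in one of the
; extra fastcc GPRs instead of falling back to the stack.
declare fastcc <vscale x 8 x i32> @hypothetical_callee(<vscale x 8 x i32>, <vscale x 8 x i32>)

define fastcc <vscale x 8 x i32> @hypothetical_caller(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) {
  %r = call fastcc <vscale x 8 x i32> @hypothetical_callee(<vscale x 8 x i32> %b, <vscale x 8 x i32> %a)
  ret <vscale x 8 x i32> %r
}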