diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -85,15 +85,18 @@ private: /// Target-specific function used to lower LoongArch calling conventions. - typedef bool LoongArchCCAssignFn(unsigned ValNo, MVT ValVT, + typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, + unsigned ValNo, MVT ValVT, CCValAssign::LocInfo LocInfo, - CCState &State); + ISD::ArgFlagsTy ArgFlags, CCState &State, + bool IsFixed, bool IsRet, Type *OrigTy); - void analyzeInputArgs(CCState &CCInfo, - const SmallVectorImpl<ISD::InputArg> &Ins, + void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, + const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet, LoongArchCCAssignFn Fn) const; - void analyzeOutputArgs(CCState &CCInfo, + void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl<ISD::OutputArg> &Outs, + bool IsRet, CallLoweringInfo *CLI, LoongArchCCAssignFn Fn) const; SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -814,46 +814,227 @@ //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// -// FIXME: Now, we only support CallingConv::C with fixed arguments which are -// passed with integer or floating-point registers. + +// Eight general-purpose registers a0-a7 used for passing integer +// arguments, with a0-a1 reused to return values. Generally, the GPRs are used +// to pass fixed-point arguments, and floating-point arguments when no FPR is +// available or with soft float ABI. const MCPhysReg ArgGPRs[] = {LoongArch::R4, LoongArch::R5, LoongArch::R6, LoongArch::R7, LoongArch::R8, LoongArch::R9, LoongArch::R10, LoongArch::R11}; +// Eight floating-point registers fa0-fa7 used for passing floating-point +// arguments, and fa0-fa1 are also used to return values except soft float ABI. const MCPhysReg ArgFPR32s[] = {LoongArch::F0, LoongArch::F1, LoongArch::F2, LoongArch::F3, LoongArch::F4, LoongArch::F5, LoongArch::F6, LoongArch::F7}; +// FPR32 and FPR64 alias each other. const MCPhysReg ArgFPR64s[] = { LoongArch::F0_64, LoongArch::F1_64, LoongArch::F2_64, LoongArch::F3_64, LoongArch::F4_64, LoongArch::F5_64, LoongArch::F6_64, LoongArch::F7_64}; +// Pass a 2*GRLen argument that has been split into two GRLen values through +// registers or the stack as necessary. +static bool CC_LoongArchAssign2GRLen(unsigned GRLen, CCState &State, + CCValAssign VA1, ISD::ArgFlagsTy ArgFlags1, + unsigned ValNo2, MVT ValVT2, MVT LocVT2, + ISD::ArgFlagsTy ArgFlags2) { + unsigned GRLenInBytes = GRLen / 8; + if (Register Reg = State.AllocateReg(ArgGPRs)) { + // At least one half can be passed via register. + State.addLoc(CCValAssign::getReg(VA1.getValNo(), VA1.getValVT(), Reg, + VA1.getLocVT(), CCValAssign::Full)); + } else { + // Both halves must be passed on the stack, with proper alignment.
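An aside on CC_LoongArchAssign2GRLen above: the placement of the two halves is plain sequential GPR/stack bookkeeping. A minimal standalone model of that rule, assuming the eight-register a0-a7 budget from ArgGPRs and an i128 argument on LA64 (the register count is the only fact taken from the patch; all other names are illustrative):

```cpp
// Simplified model of the 2*GRLen split: the first half takes a GPR if one
// is left (otherwise both halves go to the stack at the original 16-byte
// alignment), and the second half takes the next GPR or a naturally aligned
// GRLen-sized stack slot.
#include <cstdio>

struct AssignState {
  unsigned NextGPR = 0;      // index into a0-a7
  unsigned StackOffset = 0;  // bytes of outgoing stack used so far
  bool allocGPR() { return NextGPR < 8 ? (++NextGPR, true) : false; }
  unsigned allocStack(unsigned Size, unsigned Alignment) {
    StackOffset = (StackOffset + Alignment - 1) / Alignment * Alignment;
    unsigned Offset = StackOffset;
    StackOffset += Size;
    return Offset;
  }
};

int main() {
  AssignState S;
  S.NextGPR = 7;                    // pretend a0-a6 are already taken
  const unsigned GRLenInBytes = 8;  // LA64, so each half of an i128 is 8 bytes
  if (S.allocGPR())
    std::printf("first half  -> $a7\n");
  std::printf("second half -> stack+%u\n",
              S.allocStack(GRLenInBytes, GRLenInBytes));
  return 0;
}
```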
+ Align StackAlign = + std::max(Align(GRLenInBytes), ArgFlags1.getNonZeroOrigAlign()); + State.addLoc( + CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(), + State.AllocateStack(GRLenInBytes, StackAlign), + VA1.getLocVT(), CCValAssign::Full)); + State.addLoc(CCValAssign::getMem( + ValNo2, ValVT2, State.AllocateStack(GRLenInBytes, Align(GRLenInBytes)), + LocVT2, CCValAssign::Full)); + return false; + } + if (Register Reg = State.AllocateReg(ArgGPRs)) { + // The second half can also be passed via register. + State.addLoc( + CCValAssign::getReg(ValNo2, ValVT2, Reg, LocVT2, CCValAssign::Full)); + } else { + // The second half is passed via the stack, without additional alignment. + State.addLoc(CCValAssign::getMem( + ValNo2, ValVT2, State.AllocateStack(GRLenInBytes, Align(GRLenInBytes)), + LocVT2, CCValAssign::Full)); + } + return false; +} + // Implements the LoongArch calling convention. Returns true upon failure. -static bool CC_LoongArch(unsigned ValNo, MVT ValVT, - CCValAssign::LocInfo LocInfo, CCState &State) { - // Allocate to a register if possible. +static bool CC_LoongArch(const DataLayout &DL, LoongArchABI::ABI ABI, + unsigned ValNo, MVT ValVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State, bool IsFixed, bool IsRet, + Type *OrigTy) { + unsigned GRLen = DL.getLargestLegalIntTypeSizeInBits(); + assert((GRLen == 32 || GRLen == 64) && "Unspport GRLen"); + MVT GRLenVT = GRLen == 32 ? MVT::i32 : MVT::i64; + MVT LocVT = ValVT; + + // Any return value split into more than two values can't be returned + // directly. + if (IsRet && ValNo > 1) + return true; + + // If passing a variadic argument, or if no floating-point argument registers + // are available. + bool UseGPRForFloat = true; + + switch (ABI) { + default: + llvm_unreachable("Unexpected ABI"); + case LoongArchABI::ABI_ILP32S: + case LoongArchABI::ABI_LP64S: + case LoongArchABI::ABI_ILP32F: + case LoongArchABI::ABI_LP64F: + report_fatal_error("Undefined ABI"); + break; + case LoongArchABI::ABI_ILP32D: + case LoongArchABI::ABI_LP64D: + UseGPRForFloat = !IsFixed; + break; + } + + // FPR32 and FPR64 alias each other. + if (State.getFirstUnallocated(ArgFPR32s) == array_lengthof(ArgFPR32s)) + UseGPRForFloat = true; + + if (UseGPRForFloat && ValVT == MVT::f32) { + LocVT = GRLenVT; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 64 && ValVT == MVT::f64) { + LocVT = MVT::i64; + LocInfo = CCValAssign::BCvt; + } else if (UseGPRForFloat && GRLen == 32 && ValVT == MVT::f64) { + // TODO: Handle passing f64 on LA32 with D feature. + report_fatal_error("Passing f64 with GPR on LA32 is undefined"); + } + + // If this is a variadic argument, the LoongArch calling convention requires + // that it is assigned an 'even' or 'aligned' register if it has (2*GRLen)/8 + // byte alignment. An aligned register should be used regardless of whether + // the original argument was split during legalisation or not. The argument + // will not be passed by registers if the original type is larger than + // 2*GRLen, so the register alignment rule does not apply. + unsigned TwoGRLenInBytes = (2 * GRLen) / 8; + if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoGRLenInBytes && + DL.getTypeAllocSize(OrigTy) == TwoGRLenInBytes) { + unsigned RegIdx = State.getFirstUnallocated(ArgGPRs); + // Skip 'odd' register if necessary. 
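The 'even register' rule just above only becomes observable once variadic calls are supported (the patch still rejects them in LowerFormalArguments below), but the skip itself is a one-line index check. A sketch, assuming the eight-entry ArgGPRs array:

```cpp
// Sketch of the alignment rule for a variadic 2*GRLen argument: if the next
// free GPR has an odd index (a1, a3, a5, a7), burn it so the pair starts on
// an even register, unless the GPRs are already exhausted.
#include <cassert>

unsigned skipToEvenGPR(unsigned FirstUnallocated, unsigned NumArgGPRs = 8) {
  if (FirstUnallocated != NumArgGPRs && FirstUnallocated % 2 == 1)
    ++FirstUnallocated;               // skip the 'odd' register
  return FirstUnallocated;
}

int main() {
  assert(skipToEvenGPR(3) == 4);      // a3 is skipped, the pair uses a4/a5
  assert(skipToEvenGPR(2) == 2);      // already even, nothing to do
  assert(skipToEvenGPR(8) == 8);      // no GPRs left, the rule does not apply
  return 0;
}
```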
+ if (RegIdx != array_lengthof(ArgGPRs) && RegIdx % 2 == 1) + State.AllocateReg(ArgGPRs); + } + + SmallVectorImpl &PendingLocs = State.getPendingLocs(); + SmallVectorImpl &PendingArgFlags = + State.getPendingArgFlags(); + + assert(PendingLocs.size() == PendingArgFlags.size() && + "PendingLocs and PendingArgFlags out of sync"); + + // Split arguments might be passed indirectly, so keep track of the pending + // values. + if (ValVT.isScalarInteger() && (ArgFlags.isSplit() || !PendingLocs.empty())) { + LocVT = GRLenVT; + LocInfo = CCValAssign::Indirect; + PendingLocs.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + PendingArgFlags.push_back(ArgFlags); + if (!ArgFlags.isSplitEnd()) { + return false; + } + } + + // If the split argument only had two elements, it should be passed directly + // in registers or on the stack. + if (ValVT.isScalarInteger() && ArgFlags.isSplitEnd() && + PendingLocs.size() <= 2) { + assert(PendingLocs.size() == 2 && "Unexpected PendingLocs.size()"); + // Apply the normal calling convention rules to the first half of the + // split argument. + CCValAssign VA = PendingLocs[0]; + ISD::ArgFlagsTy AF = PendingArgFlags[0]; + PendingLocs.clear(); + PendingArgFlags.clear(); + return CC_LoongArchAssign2GRLen(GRLen, State, VA, AF, ValNo, ValVT, LocVT, + ArgFlags); + } + + // Allocate to a register if possible, or else a stack slot. Register Reg; + unsigned StoreSizeBytes = GRLen / 8; + Align StackAlign = Align(GRLen / 8); - if (ValVT == MVT::f32) + if (ValVT == MVT::f32 && !UseGPRForFloat) Reg = State.AllocateReg(ArgFPR32s); - else if (ValVT == MVT::f64) + else if (ValVT == MVT::f64 && !UseGPRForFloat) Reg = State.AllocateReg(ArgFPR64s); else Reg = State.AllocateReg(ArgGPRs); + + unsigned StackOffset = + Reg ? 0 : State.AllocateStack(StoreSizeBytes, StackAlign); + + // If we reach this point and PendingLocs is non-empty, we must be at the + // end of a split argument that must be passed indirectly. + if (!PendingLocs.empty()) { + assert(ArgFlags.isSplitEnd() && "Expected ArgFlags.isSplitEnd()"); + assert(PendingLocs.size() > 2 && "Unexpected PendingLocs.size()"); + for (auto &It : PendingLocs) { + if (Reg) + It.convertToReg(Reg); + else + It.convertToMem(StackOffset); + State.addLoc(It); + } + PendingLocs.clear(); + PendingArgFlags.clear(); + return false; + } + assert((!UseGPRForFloat || LocVT == GRLenVT) && + "Expected an GRLenVT at this stage"); + if (Reg) { - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, ValVT, LocInfo)); + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } - // TODO: Handle arguments passed without register. - return true; + // When a floating-point value is passed on the stack, no bit-conversion is + // needed. 
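A brief illustration of the two floating-point representations distinguished above, as a standalone sketch (the memcpy stands in for the BCvt bitcast; nothing here is LLVM API):

```cpp
// An f32 carried in a GPR travels as its raw bit pattern (what BCvt models),
// while an f32 that ends up in a stack slot is stored as a plain 4-byte
// float with no conversion at all.
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t f32Bits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits)); // the bitcast the GPR path performs
  return Bits;                          // lands in the low half of the GPR
}

int main() {
  float OnStack = 1.0f;                 // stored as-is when no register is left
  std::printf("GPR image: 0x%08x, stack copy: %f\n",
              (unsigned)f32Bits(1.0f), OnStack);
  return 0;
}
```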
+ if (ValVT.isFloatingPoint()) { + LocVT = ValVT; + LocInfo = CCValAssign::Full; + } + + State.addLoc(CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo)); + return false; } void LoongArchTargetLowering::analyzeInputArgs( - CCState &CCInfo, const SmallVectorImpl &Ins, + MachineFunction &MF, CCState &CCInfo, + const SmallVectorImpl &Ins, bool IsRet, LoongArchCCAssignFn Fn) const { + FunctionType *FType = MF.getFunction().getFunctionType(); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { MVT ArgVT = Ins[i].VT; - - if (Fn(i, ArgVT, CCValAssign::Full, CCInfo)) { + Type *ArgTy = nullptr; + if (IsRet) + ArgTy = FType->getReturnType(); + else if (Ins[i].isOrigArg()) + ArgTy = FType->getParamType(Ins[i].getOrigArgIndex()); + LoongArchABI::ABI ABI = + MF.getSubtarget().getTargetABI(); + if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Ins[i].Flags, + CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'); llvm_unreachable(""); @@ -862,12 +1043,16 @@ } void LoongArchTargetLowering::analyzeOutputArgs( - CCState &CCInfo, const SmallVectorImpl &Outs, - LoongArchCCAssignFn Fn) const { + MachineFunction &MF, CCState &CCInfo, + const SmallVectorImpl &Outs, bool IsRet, + CallLoweringInfo *CLI, LoongArchCCAssignFn Fn) const { for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT ArgVT = Outs[i].VT; - - if (Fn(i, ArgVT, CCValAssign::Full, CCInfo)) { + Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr; + LoongArchABI::ABI ABI = + MF.getSubtarget().getTargetABI(); + if (Fn(MF.getDataLayout(), ABI, i, ArgVT, CCValAssign::Full, Outs[i].Flags, + CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"); llvm_unreachable(""); @@ -875,17 +1060,85 @@ } } +// Convert Val to a ValVT. Should not be called for CCValAssign::Indirect +// values. +static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, + const CCValAssign &VA, const SDLoc &DL) { + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unexpected CCValAssign::LocInfo"); + case CCValAssign::Full: + case CCValAssign::Indirect: + break; + case CCValAssign::BCvt: + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, Val); + else + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + } + return Val; +} + static SDValue unpackFromRegLoc(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA, const SDLoc &DL, const LoongArchTargetLowering &TLI) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); EVT LocVT = VA.getLocVT(); + SDValue Val; const TargetRegisterClass *RC = TLI.getRegClassFor(LocVT.getSimpleVT()); Register VReg = RegInfo.createVirtualRegister(RC); RegInfo.addLiveIn(VA.getLocReg(), VReg); + Val = DAG.getCopyFromReg(Chain, DL, VReg, LocVT); - return DAG.getCopyFromReg(Chain, DL, VReg, LocVT); + return convertLocVTToValVT(DAG, Val, VA, DL); +} + +// The caller is responsible for loading the full value if the argument is +// passed with CCValAssign::Indirect. 
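The Indirect case referred to above hands the lowering code only an address; the individual parts are then loaded at base + PartOffset. A hedged sketch of that recovery step (the offsets model an i256 split into four i64 parts, and the check assumes a little-endian host, as on LoongArch):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

struct Part { unsigned PartOffset; };            // mirrors Ins[i].PartOffset

static uint64_t loadPart(const unsigned char *Base, Part P) {
  uint64_t V;
  std::memcpy(&V, Base + P.PartOffset, sizeof(V)); // one GRLen-sized load
  return V;
}

int main() {
  unsigned char Spill[32] = {};                  // an i256 spilled by the caller
  Spill[8] = 42;                                 // low byte of the second part
  const Part Parts[] = {{0}, {8}, {16}, {24}};   // four i64 pieces
  assert(loadPart(Spill, Parts[1]) == 42);       // little-endian host assumed
  return 0;
}
```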
+static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain, + const CCValAssign &VA, const SDLoc &DL) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); + EVT ValVT = VA.getValVT(); + int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(), + /*IsImmutable=*/true); + SDValue FIN = DAG.getFrameIndex( + FI, MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0))); + + ISD::LoadExtType ExtType; + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unexpected CCValAssign::LocInfo"); + case CCValAssign::Full: + case CCValAssign::Indirect: + case CCValAssign::BCvt: + ExtType = ISD::NON_EXTLOAD; + break; + } + return DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), ValVT); +} + +static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, + const CCValAssign &VA, const SDLoc &DL) { + EVT LocVT = VA.getLocVT(); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unexpected CCValAssign::LocInfo"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + Val = DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Val); + else + Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val); + break; + } + return Val; } // Transform physical registers into virtual registers. @@ -903,18 +1156,56 @@ break; } + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeInputArgs(CCInfo, Ins, CC_LoongArch); + analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false, CC_LoongArch); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue; + if (VA.isRegLoc()) + ArgValue = unpackFromRegLoc(DAG, Chain, VA, DL, *this); + else + ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL); + if (VA.getLocInfo() == CCValAssign::Indirect) { + // If the original argument was split and passed by reference, we need to + // load all parts of it here (using the same address). + InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, + MachinePointerInfo())); + unsigned ArgIndex = Ins[i].OrigArgIndex; + unsigned ArgPartOffset = Ins[i].PartOffset; + assert(ArgPartOffset == 0); + while (i + 1 != e && Ins[i + 1].OrigArgIndex == ArgIndex) { + CCValAssign &PartVA = ArgLocs[i + 1]; + unsigned PartOffset = Ins[i + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue, Offset); + InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address, + MachinePointerInfo())); + ++i; + } + continue; + } + InVals.push_back(ArgValue); + } - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - InVals.push_back(unpackFromRegLoc(DAG, Chain, ArgLocs[i], DL, *this)); + if (IsVarArg) { + // TODO: Support vararg. + report_fatal_error("Not support vararg"); + } return Chain; } +static Align getPrefTypeAlign(EVT VT, SelectionDAG &DAG) { + return DAG.getDataLayout().getPrefTypeAlign( + VT.getTypeForEVT(*DAG.getContext())); +} + // Lower a call to a callseq_start + CALL + callseq_end chain, and add input // and output parameter nodes. 
SDValue @@ -930,49 +1221,128 @@ CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; EVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT GRLenVT = Subtarget.getGRLenVT(); CLI.IsTailCall = false; - if (IsVarArg) - report_fatal_error("LowerCall with varargs not implemented"); - MachineFunction &MF = DAG.getMachineFunction(); // Analyze the operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - analyzeOutputArgs(ArgCCInfo, Outs, CC_LoongArch); + analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI, CC_LoongArch); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = ArgCCInfo.getNextStackOffset(); - for (auto &Arg : Outs) { - if (!Arg.Flags.isByVal()) + // Create local copies for byval args. + SmallVector ByValArgs; + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + ISD::ArgFlagsTy Flags = Outs[i].Flags; + if (!Flags.isByVal()) continue; - report_fatal_error("Passing arguments byval not implemented"); + + SDValue Arg = OutVals[i]; + unsigned Size = Flags.getByValSize(); + Align Alignment = Flags.getNonZeroByValAlign(); + + int FI = + MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false); + SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); + SDValue SizeNode = DAG.getConstant(Size, DL, GRLenVT); + + Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment, + /*IsVolatile=*/false, + /*AlwaysInline=*/false, /*isTailCall=*/false, + MachinePointerInfo(), MachinePointerInfo()); + ByValArgs.push_back(FIPtr); } Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); // Copy argument values to their designated locations. SmallVector> RegsToPass; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + SmallVector MemOpChains; + SDValue StackPtr; + for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue ArgValue = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; // Promote the value if needed. - // For now, only handle fully promoted arguments. - if (VA.getLocInfo() != CCValAssign::Full) - report_fatal_error("Unknown loc info"); + // For now, only handle fully promoted and indirect arguments. + if (VA.getLocInfo() == CCValAssign::Indirect) { + // Store the argument in a stack slot and pass its address. + Align StackAlign = + std::max(getPrefTypeAlign(Outs[i].ArgVT, DAG), + getPrefTypeAlign(ArgValue.getValueType(), DAG)); + TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); + // If the original argument was split and passed by reference, we need to + // store the required parts of it here (and pass just one address). + unsigned ArgIndex = Outs[i].OrigArgIndex; + unsigned ArgPartOffset = Outs[i].PartOffset; + assert(ArgPartOffset == 0); + // Calculate the total size to store. We don't have access to what we're + // actually storing other than performing the loop and collecting the + // info. 
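A compact model of the size/alignment bookkeeping described above: the spill slot has to cover the combined store size of all parts and the strictest preferred alignment among them. The part sizes below are just an example (an i256 as four i64 pieces):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

struct PartInfo { uint64_t StoreSize; uint64_t PrefAlign; };

int main() {
  const PartInfo Parts[] = {{8, 8}, {8, 8}, {8, 8}, {8, 8}};
  uint64_t StoredSize = 0, StackAlign = 1;
  for (const PartInfo &P : Parts) {
    StoredSize += P.StoreSize;                      // accumulate total size
    StackAlign = std::max(StackAlign, P.PrefAlign); // track max alignment
  }
  assert(StoredSize == 32 && StackAlign == 8);
  return 0;
}
```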
+ SmallVector> Parts; + while (i + 1 != e && Outs[i + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[i + 1]; + unsigned PartOffset = Outs[i + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + + StoredSize += PartVT.getStoreSize(); + StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); + Parts.push_back(std::make_pair(PartValue, Offset)); + ++i; + } + SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); + int FI = cast(SpillSlot)->getIndex(); + MemOpChains.push_back( + DAG.getStore(Chain, DL, ArgValue, SpillSlot, + MachinePointerInfo::getFixedStack(MF, FI))); + for (const auto &Part : Parts) { + SDValue PartValue = Part.first; + SDValue PartOffset = Part.second; + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + MemOpChains.push_back( + DAG.getStore(Chain, DL, PartValue, Address, + MachinePointerInfo::getFixedStack(MF, FI))); + } + ArgValue = SpillSlot; + } else { + ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); + } + + // Use local copy if it is a byval arg. + if (Flags.isByVal()) + ArgValue = ByValArgs[j++]; if (VA.isRegLoc()) { // Queue up the argument copies and emit them at the end. RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); } else { - report_fatal_error("Passing arguments via the stack not implemented"); + assert(VA.isMemLoc() && "Argument not register or memory"); + + // Work out the address of the stack slot. + if (!StackPtr.getNode()) + StackPtr = DAG.getCopyFromReg(Chain, DL, LoongArch::R3, PtrVT); + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, + DAG.getIntPtrConstant(VA.getLocMemOffset(), DL)); + + // Emit the store. + MemOpChains.push_back( + DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo())); } } + // Join the stores, which are independent of one another. + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + SDValue Glue; // Build a sequence of copy-to-reg nodes, chained and glued together. @@ -1025,17 +1395,20 @@ // Assign locations to each value returned by this call. SmallVector RVLocs; CCState RetCCInfo(CallConv, IsVarArg, MF, RVLocs, *DAG.getContext()); - analyzeInputArgs(RetCCInfo, Ins, CC_LoongArch); + analyzeInputArgs(MF, RetCCInfo, Ins, /*IsRet=*/true, CC_LoongArch); // Copy all of the result registers out of their specified physreg. for (auto &VA : RVLocs) { // Copy the value out. SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue); + // Glue the RetValue to the end of the call sequence. Chain = RetValue.getValue(1); Glue = RetValue.getValue(2); - InVals.push_back(Chain.getValue(0)); + RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL); + + InVals.push_back(RetValue); } return Chain; @@ -1044,9 +1417,18 @@ bool LoongArchTargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { - // Any return value split in to more than two values can't be returned - // directly. 
- return Outs.size() <= 2; + SmallVector RVLocs; + CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); + + for (unsigned i = 0, e = Outs.size(); i != e; ++i) { + LoongArchABI::ABI ABI = + MF.getSubtarget().getTargetABI(); + if (CC_LoongArch(MF.getDataLayout(), ABI, i, Outs[i].VT, CCValAssign::Full, + Outs[i].Flags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, + nullptr)) + return false; + } + return true; } SDValue LoongArchTargetLowering::LowerReturn( @@ -1061,7 +1443,8 @@ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - analyzeOutputArgs(CCInfo, Outs, CC_LoongArch); + analyzeOutputArgs(DAG.getMachineFunction(), CCInfo, Outs, /*IsRet=*/true, + nullptr, CC_LoongArch); SDValue Glue; SmallVector RetOps(1, Chain); @@ -1072,7 +1455,8 @@ assert(VA.isRegLoc() && "Can only return in registers!"); // Handle a 'normal' return. - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVals[i], Glue); + SDValue Val = convertValVTToLocVT(DAG, OutVals[i], VA, DL); + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue); // Guarantee that all emitted copies are stuck together. Glue = Chain.getValue(1); diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll @@ -0,0 +1,529 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+d --target-abi=lp64d < %s \ +; RUN: | FileCheck %s + +;; Check that on LA64, i128 is passed in a pair of GPRs. +define i64 @callee_i128_in_regs(i64 %a, i128 %b) nounwind { +; CHECK-LABEL: callee_i128_in_regs: +; CHECK: # %bb.0: +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %b_trunc = trunc i128 %b to i64 + %1 = add i64 %a, %b_trunc + ret i64 %1 +} + +define i64 @caller_i128_in_regs() nounwind { +; CHECK-LABEL: caller_i128_in_regs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: ori $a1, $zero, 2 +; CHECK-NEXT: move $a2, $zero +; CHECK-NEXT: bl callee_i128_in_regs +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_i128_in_regs(i64 1, i128 2) + ret i64 %1 +} + +;; Check that the stack is used once the GPRs are exhausted. 
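Roughly the C-level shape of the test that follows, assuming Clang's __int128 lowers to the IR i128 used there (a sketch, not part of the test): $a0-$a7 fill left to right, the second i128 is split across $a7 and the first stack slot, and the final i64 arrives entirely on the stack.

```cpp
#include <cstdint>

using i128 = __int128;  // GCC/Clang extension, stand-in for the IR i128

// a -> $a0, b -> $a1, c -> $a2, d -> $a3, e -> $a4/$a5, f -> $a6,
// g -> $a7 + first stack slot, h -> second stack slot.
static uint64_t callee_many_scalars(uint8_t a, uint16_t b, uint32_t c,
                                    uint64_t d, i128 e, uint64_t f, i128 g,
                                    uint64_t h) {
  return a + b + c + d + (uint64_t)(e == g) + f + h;
}

int main() {
  return callee_many_scalars(1, 2, 3, 4, 5, 6, 7, 8) == 24 ? 0 : 1;
}
```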
+define i64 @callee_many_scalars(i8 %a, i16 %b, i32 %c, i64 %d, i128 %e, i64 %f, i128 %g, i64 %h) nounwind { +; CHECK-LABEL: callee_many_scalars: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $t0, $sp, 0 +; CHECK-NEXT: xor $a5, $a5, $t0 +; CHECK-NEXT: xor $a4, $a4, $a7 +; CHECK-NEXT: or $a4, $a4, $a5 +; CHECK-NEXT: bstrpick.d $a1, $a1, 15, 0 +; CHECK-NEXT: andi $a0, $a0, 255 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: bstrpick.d $a1, $a2, 31, 0 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: add.d $a0, $a0, $a3 +; CHECK-NEXT: sltui $a1, $a4, 1 +; CHECK-NEXT: add.d $a0, $a1, $a0 +; CHECK-NEXT: add.d $a0, $a0, $a6 +; CHECK-NEXT: ld.d $a1, $sp, 8 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %a_ext = zext i8 %a to i64 + %b_ext = zext i16 %b to i64 + %c_ext = zext i32 %c to i64 + %1 = add i64 %a_ext, %b_ext + %2 = add i64 %1, %c_ext + %3 = add i64 %2, %d + %4 = icmp eq i128 %e, %g + %5 = zext i1 %4 to i64 + %6 = add i64 %5, %3 + %7 = add i64 %6, %f + %8 = add i64 %7, %h + ret i64 %8 +} + +define i64 @caller_many_scalars() nounwind { +; CHECK-LABEL: caller_many_scalars: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -32 +; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; CHECK-NEXT: ori $a0, $zero, 8 +; CHECK-NEXT: st.d $a0, $sp, 8 +; CHECK-NEXT: st.d $zero, $sp, 0 +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: ori $a1, $zero, 2 +; CHECK-NEXT: ori $a2, $zero, 3 +; CHECK-NEXT: ori $a3, $zero, 4 +; CHECK-NEXT: ori $a4, $zero, 5 +; CHECK-NEXT: ori $a6, $zero, 6 +; CHECK-NEXT: ori $a7, $zero, 7 +; CHECK-NEXT: move $a5, $zero +; CHECK-NEXT: bl callee_many_scalars +; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 32 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_many_scalars(i8 1, i16 2, i32 3, i64 4, i128 5, i64 6, i128 7, i64 8) + ret i64 %1 +} + +;; Check that i256 is passed indirectly. 
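Before the tests, a hedged illustration of the indirect convention they exercise: a scalar wider than 2*GRLen never travels in registers; the caller spills it into its own frame and passes only the address in a GPR. The struct below is merely a stand-in for an IR i256, not how Clang would pass a real aggregate.

```cpp
#include <cstdint>

struct I256 { uint64_t w[4]; };                   // four GRLen-sized words

static bool equal_i256(const uint64_t *a, const uint64_t *b) {
  // The callee only receives the two addresses (in $a0/$a1 below) and loads
  // the words through them, much like the ld.d sequences in the test.
  for (int i = 0; i < 4; ++i)
    if (a[i] != b[i])
      return false;
  return true;
}

int main() {
  I256 One = {{1, 0, 0, 0}}, Two = {{2, 0, 0, 0}}; // caller-owned spill slots
  return equal_i256(One.w, Two.w) ? 1 : 0;         // values differ, returns 0
}
```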
+ +define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { +; CHECK-LABEL: callee_large_scalars: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a2, $a1, 24 +; CHECK-NEXT: ld.d $a3, $a0, 24 +; CHECK-NEXT: xor $a2, $a3, $a2 +; CHECK-NEXT: ld.d $a3, $a1, 8 +; CHECK-NEXT: ld.d $a4, $a0, 8 +; CHECK-NEXT: xor $a3, $a4, $a3 +; CHECK-NEXT: or $a2, $a3, $a2 +; CHECK-NEXT: ld.d $a3, $a1, 16 +; CHECK-NEXT: ld.d $a4, $a0, 16 +; CHECK-NEXT: xor $a3, $a4, $a3 +; CHECK-NEXT: ld.d $a1, $a1, 0 +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: xor $a0, $a0, $a1 +; CHECK-NEXT: or $a0, $a0, $a3 +; CHECK-NEXT: or $a0, $a0, $a2 +; CHECK-NEXT: sltui $a0, $a0, 1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = icmp eq i256 %a, %b + %2 = zext i1 %1 to i64 + ret i64 %2 +} + +define i64 @caller_large_scalars() nounwind { +; CHECK-LABEL: caller_large_scalars: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -80 +; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; CHECK-NEXT: ori $a0, $zero, 2 +; CHECK-NEXT: st.d $a0, $sp, 0 +; CHECK-NEXT: st.d $zero, $sp, 24 +; CHECK-NEXT: st.d $zero, $sp, 16 +; CHECK-NEXT: st.d $zero, $sp, 8 +; CHECK-NEXT: st.d $zero, $sp, 56 +; CHECK-NEXT: st.d $zero, $sp, 48 +; CHECK-NEXT: st.d $zero, $sp, 40 +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: st.d $a0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 +; CHECK-NEXT: addi.d $a1, $sp, 0 +; CHECK-NEXT: bl callee_large_scalars +; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_large_scalars(i256 1, i256 2) + ret i64 %1 +} + +;; Check that arguments larger than 2*GRLen are handled correctly when their +;; address is passed on the stack rather than in memory. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation +define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i256 %h, i64 %i, i256 %j) nounwind { +; CHECK-LABEL: callee_large_scalars_exhausted_regs: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a0, $sp, 8 +; CHECK-NEXT: ld.d $a1, $a0, 24 +; CHECK-NEXT: ld.d $a2, $a7, 24 +; CHECK-NEXT: xor $a1, $a2, $a1 +; CHECK-NEXT: ld.d $a2, $a0, 8 +; CHECK-NEXT: ld.d $a3, $a7, 8 +; CHECK-NEXT: xor $a2, $a3, $a2 +; CHECK-NEXT: or $a1, $a2, $a1 +; CHECK-NEXT: ld.d $a2, $a0, 16 +; CHECK-NEXT: ld.d $a3, $a7, 16 +; CHECK-NEXT: xor $a2, $a3, $a2 +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: ld.d $a3, $a7, 0 +; CHECK-NEXT: xor $a0, $a3, $a0 +; CHECK-NEXT: or $a0, $a0, $a2 +; CHECK-NEXT: or $a0, $a0, $a1 +; CHECK-NEXT: sltui $a0, $a0, 1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = icmp eq i256 %h, %j + %2 = zext i1 %1 to i64 + ret i64 %2 +} + +define i64 @caller_large_scalars_exhausted_regs() nounwind { +; CHECK-LABEL: caller_large_scalars_exhausted_regs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 16 +; CHECK-NEXT: st.d $a0, $sp, 8 +; CHECK-NEXT: ori $a0, $zero, 9 +; CHECK-NEXT: st.d $a0, $sp, 0 +; CHECK-NEXT: ori $a0, $zero, 10 +; CHECK-NEXT: st.d $a0, $sp, 16 +; CHECK-NEXT: st.d $zero, $sp, 40 +; CHECK-NEXT: st.d $zero, $sp, 32 +; CHECK-NEXT: st.d $zero, $sp, 24 +; CHECK-NEXT: st.d $zero, $sp, 72 +; CHECK-NEXT: st.d $zero, $sp, 64 +; CHECK-NEXT: st.d $zero, $sp, 56 +; CHECK-NEXT: ori $a0, $zero, 8 +; CHECK-NEXT: st.d $a0, $sp, 48 +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: ori $a1, $zero, 2 +; CHECK-NEXT: ori $a2, $zero, 3 +; CHECK-NEXT: ori $a3, $zero, 4 +; CHECK-NEXT: ori $a4, $zero, 5 +; 
CHECK-NEXT: ori $a5, $zero, 6 +; CHECK-NEXT: ori $a6, $zero, 7 +; CHECK-NEXT: addi.d $a7, $sp, 48 +; CHECK-NEXT: bl callee_large_scalars_exhausted_regs +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_large_scalars_exhausted_regs( + i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i256 8, i64 9, + i256 10) + ret i64 %1 +} + +;; Check large struct arguments, which are passed byval + +%struct.large = type { i64, i64, i64, i64 } + +define i64 @callee_large_struct(ptr byval(%struct.large) align 8 %a) nounwind { +; CHECK-LABEL: callee_large_struct: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a1, $a0, 24 +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = getelementptr inbounds %struct.large, ptr %a, i64 0, i32 0 + %2 = getelementptr inbounds %struct.large, ptr %a, i64 0, i32 3 + %3 = load i64, ptr %1 + %4 = load i64, ptr %2 + %5 = add i64 %3, %4 + ret i64 %5 +} + +define i64 @caller_large_struct() nounwind { +; CHECK-LABEL: caller_large_struct: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -80 +; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: st.d $a0, $sp, 40 +; CHECK-NEXT: st.d $a0, $sp, 8 +; CHECK-NEXT: ori $a0, $zero, 2 +; CHECK-NEXT: st.d $a0, $sp, 48 +; CHECK-NEXT: st.d $a0, $sp, 16 +; CHECK-NEXT: ori $a0, $zero, 3 +; CHECK-NEXT: st.d $a0, $sp, 56 +; CHECK-NEXT: st.d $a0, $sp, 24 +; CHECK-NEXT: ori $a0, $zero, 4 +; CHECK-NEXT: st.d $a0, $sp, 64 +; CHECK-NEXT: st.d $a0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 8 +; CHECK-NEXT: bl callee_large_struct +; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: jirl $zero, $ra, 0 + %ls = alloca %struct.large, align 8 + %a = getelementptr inbounds %struct.large, ptr %ls, i64 0, i32 0 + store i64 1, ptr %a + %b = getelementptr inbounds %struct.large, ptr %ls, i64 0, i32 1 + store i64 2, ptr %b + %c = getelementptr inbounds %struct.large, ptr %ls, i64 0, i32 2 + store i64 3, ptr %c + %d = getelementptr inbounds %struct.large, ptr %ls, i64 0, i32 3 + store i64 4, ptr %d + %1 = call i64 @callee_large_struct(ptr byval(%struct.large) align 8 %ls) + ret i64 %1 +} + +;; Check return scalar which size is 2*GRLen. + +define i128 @callee_small_scalar_ret() nounwind { +; CHECK-LABEL: callee_small_scalar_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a0, $zero, -1 +; CHECK-NEXT: move $a1, $a0 +; CHECK-NEXT: jirl $zero, $ra, 0 + ret i128 -1 +} + +define i64 @caller_small_scalar_ret() nounwind { +; CHECK-LABEL: caller_small_scalar_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: bl callee_small_scalar_ret +; CHECK-NEXT: addi.w $a2, $zero, -1 +; CHECK-NEXT: xor $a1, $a1, $a2 +; CHECK-NEXT: addi.w $a2, $zero, -2 +; CHECK-NEXT: xor $a0, $a0, $a2 +; CHECK-NEXT: or $a0, $a0, $a1 +; CHECK-NEXT: sltui $a0, $a0, 1 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i128 @callee_small_scalar_ret() + %2 = icmp eq i128 -2, %1 + %3 = zext i1 %2 to i64 + ret i64 %3 +} + +;; Check return struct which size is 2*GRLen. 
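A C-level counterpart of the next pair of tests, under the assumption that a plain two-word aggregate lowers the same way: a 2*GRLen (16-byte) return value comes back in $a0/$a1 rather than through a hidden pointer.

```cpp
#include <cstdint>

struct Small {
  uint64_t first;  // returned in $a0
  void *second;    // returned in $a1
};

Small callee_small_struct_ret() { return {1, nullptr}; }

int main() {
  Small S = callee_small_struct_ret();
  return (S.first + (uint64_t)(uintptr_t)S.second) == 1 ? 0 : 1;
}
```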
+ +%struct.small = type { i64, ptr } + +define %struct.small @callee_small_struct_ret() nounwind { +; CHECK-LABEL: callee_small_struct_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: move $a1, $zero +; CHECK-NEXT: jirl $zero, $ra, 0 + ret %struct.small { i64 1, ptr null } +} + +define i64 @caller_small_struct_ret() nounwind { +; CHECK-LABEL: caller_small_struct_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: bl callee_small_struct_ret +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call %struct.small @callee_small_struct_ret() + %2 = extractvalue %struct.small %1, 0 + %3 = extractvalue %struct.small %1, 1 + %4 = ptrtoint ptr %3 to i64 + %5 = add i64 %2, %4 + ret i64 %5 +} + +;; Check return scalar which size is more than 2*GRLen. + +define i256 @callee_large_scalar_ret() nounwind { +; CHECK-LABEL: callee_large_scalar_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.w $a1, $zero, -1 +; CHECK-NEXT: st.d $a1, $a0, 24 +; CHECK-NEXT: st.d $a1, $a0, 16 +; CHECK-NEXT: st.d $a1, $a0, 8 +; CHECK-NEXT: lu12i.w $a1, -30141 +; CHECK-NEXT: ori $a1, $a1, 747 +; CHECK-NEXT: st.d $a1, $a0, 0 +; CHECK-NEXT: jirl $zero, $ra, 0 + ret i256 -123456789 +} + +define void @caller_large_scalar_ret() nounwind { +; CHECK-LABEL: caller_large_scalar_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -48 +; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: bl callee_large_scalar_ret +; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 48 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i256 @callee_large_scalar_ret() + ret void +} + +;; Check return struct which size is more than 2*GRLen. 
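A hedged C-level sketch of the sret convention the following tests check, assuming a plain 32-byte aggregate: the caller reserves the result slot and passes its address as an implicit argument in $a0, and the callee stores the fields through it.

```cpp
#include <cstdint>

struct Large { int64_t a, b, c, d; };  // 4 * GRLen bytes on LA64

Large callee_large_struct_ret() {      // lowered as: void f(Large *sret)
  return {1, 2, 3, 4};
}

int main() {
  Large L = callee_large_struct_ret(); // &L is what travels in $a0
  return (L.a + L.d) == 5 ? 0 : 1;
}
```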
+ +define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result) nounwind { +; CHECK-LABEL: callee_large_struct_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: ori $a1, $zero, 4 +; CHECK-NEXT: st.w $a1, $a0, 24 +; CHECK-NEXT: ori $a1, $zero, 3 +; CHECK-NEXT: st.w $a1, $a0, 16 +; CHECK-NEXT: ori $a1, $zero, 2 +; CHECK-NEXT: st.w $a1, $a0, 8 +; CHECK-NEXT: st.w $zero, $a0, 28 +; CHECK-NEXT: st.w $zero, $a0, 20 +; CHECK-NEXT: st.w $zero, $a0, 12 +; CHECK-NEXT: st.w $zero, $a0, 4 +; CHECK-NEXT: ori $a1, $zero, 1 +; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: jirl $zero, $ra, 0 + %a = getelementptr inbounds %struct.large, ptr %agg.result, i64 0, i32 0 + store i64 1, ptr %a, align 4 + %b = getelementptr inbounds %struct.large, ptr %agg.result, i64 0, i32 1 + store i64 2, ptr %b, align 4 + %c = getelementptr inbounds %struct.large, ptr %agg.result, i64 0, i32 2 + store i64 3, ptr %c, align 4 + %d = getelementptr inbounds %struct.large, ptr %agg.result, i64 0, i32 3 + store i64 4, ptr %d, align 4 + ret void +} + +define i64 @caller_large_struct_ret() nounwind { +; CHECK-LABEL: caller_large_struct_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -48 +; CHECK-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 8 +; CHECK-NEXT: bl callee_large_struct_ret +; CHECK-NEXT: ld.d $a0, $sp, 32 +; CHECK-NEXT: ld.d $a1, $sp, 8 +; CHECK-NEXT: add.d $a0, $a1, $a0 +; CHECK-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 48 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = alloca %struct.large + call void @callee_large_struct_ret(ptr sret(%struct.large) %1) + %2 = getelementptr inbounds %struct.large, ptr %1, i64 0, i32 0 + %3 = load i64, ptr %2 + %4 = getelementptr inbounds %struct.large, ptr %1, i64 0, i32 3 + %5 = load i64, ptr %4 + %6 = add i64 %3, %5 + ret i64 %6 +} + +;; Check pass floating-point arguments whith FPRs. + +define i64 @callee_float_in_fpr(i64 %a, float %b, double %c) nounwind { +; CHECK-LABEL: callee_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: ftintrz.l.s $fa0, $fa0 +; CHECK-NEXT: movfr2gr.d $a1, $fa0 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: ftintrz.l.d $fa0, $fa1 +; CHECK-NEXT: movfr2gr.d $a1, $fa0 +; CHECK-NEXT: add.d $a0, $a0, $a1 +; CHECK-NEXT: jirl $zero, $ra, 0 + %b_fptosi = fptosi float %b to i64 + %c_fptosi = fptosi double %c to i64 + %1 = add i64 %a, %b_fptosi + %2 = add i64 %1, %c_fptosi + ret i64 %2 +} + +define i64 @caller_float_in_fpr() nounwind { +; CHECK-LABEL: caller_float_in_fpr: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: ori $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.w $fa0, $zero +; CHECK-NEXT: movgr2fr.d $fa1, $zero +; CHECK-NEXT: bl callee_float_in_fpr +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_float_in_fpr(i64 1, float 0.0, double 0.0) + ret i64 %1 +} + +;; Check that the GPR is used once the FPRs are exhausted. + +;; Must keep define on a single line due to an update_llc_test_checks.py limitation. 
+define i64 @callee_double_in_gpr_exhausted_fprs(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) nounwind { +; CHECK-LABEL: callee_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: ftintrz.l.d $fa0, $fa7 +; CHECK-NEXT: movfr2gr.d $a1, $fa0 +; CHECK-NEXT: movgr2fr.d $fa0, $a0 +; CHECK-NEXT: ftintrz.l.d $fa0, $fa0 +; CHECK-NEXT: movfr2gr.d $a0, $fa0 +; CHECK-NEXT: add.d $a0, $a1, $a0 +; CHECK-NEXT: jirl $zero, $ra, 0 + %h_fptosi = fptosi double %h to i64 + %i_fptosi = fptosi double %i to i64 + %1 = add i64 %h_fptosi, %i_fptosi + ret i64 %1 +} + +define i64 @caller_double_in_gpr_exhausted_fprs() nounwind { +; CHECK-LABEL: caller_double_in_gpr_exhausted_fprs: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: pcalau12i $a0, .LCPI21_0 +; CHECK-NEXT: addi.d $a0, $a0, .LCPI21_0 +; CHECK-NEXT: pcalau12i $a1, .LCPI21_1 +; CHECK-NEXT: addi.d $a1, $a1, .LCPI21_1 +; CHECK-NEXT: pcalau12i $a2, .LCPI21_2 +; CHECK-NEXT: addi.d $a2, $a2, .LCPI21_2 +; CHECK-NEXT: pcalau12i $a3, .LCPI21_3 +; CHECK-NEXT: addi.d $a3, $a3, .LCPI21_3 +; CHECK-NEXT: pcalau12i $a4, .LCPI21_4 +; CHECK-NEXT: addi.d $a4, $a4, .LCPI21_4 +; CHECK-NEXT: pcalau12i $a5, .LCPI21_5 +; CHECK-NEXT: addi.d $a5, $a5, .LCPI21_5 +; CHECK-NEXT: addi.d $a6, $zero, 1 +; CHECK-NEXT: movgr2fr.d $fa0, $a6 +; CHECK-NEXT: ffint.d.l $fa0, $fa0 +; CHECK-NEXT: fld.d $fa1, $a5, 0 +; CHECK-NEXT: fld.d $fa2, $a4, 0 +; CHECK-NEXT: fld.d $fa3, $a3, 0 +; CHECK-NEXT: fld.d $fa4, $a2, 0 +; CHECK-NEXT: fld.d $fa5, $a1, 0 +; CHECK-NEXT: fld.d $fa6, $a0, 0 +; CHECK-NEXT: pcalau12i $a0, .LCPI21_6 +; CHECK-NEXT: addi.d $a0, $a0, .LCPI21_6 +; CHECK-NEXT: fld.d $fa7, $a0, 0 +; CHECK-NEXT: ori $a0, $zero, 0 +; CHECK-NEXT: lu32i.d $a0, 131072 +; CHECK-NEXT: lu52i.d $a0, $a0, 1026 +; CHECK-NEXT: bl callee_double_in_gpr_exhausted_fprs +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call i64 @callee_double_in_gpr_exhausted_fprs( + double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, + double 7.0, double 8.0, double 9.0) + ret i64 %1 +} + +;; Check double ret. + +define double @callee_double_ret() nounwind { +; CHECK-LABEL: callee_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $a0, $zero, 1 +; CHECK-NEXT: movgr2fr.d $fa0, $a0 +; CHECK-NEXT: ffint.d.l $fa0, $fa0 +; CHECK-NEXT: jirl $zero, $ra, 0 + ret double 1.0 +} + +define i64 @caller_double_ret() nounwind { +; CHECK-LABEL: caller_double_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -16 +; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: bl callee_double_ret +; CHECK-NEXT: movfr2gr.d $a0, $fa0 +; CHECK-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 16 +; CHECK-NEXT: jirl $zero, $ra, 0 + %1 = call double @callee_double_ret() + %2 = bitcast double %1 to i64 + ret i64 %2 +}
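A closing sketch for the double-return tests above: under lp64d a double is returned in $fa0, and the caller's bitcast to i64 (the movfr2gr.d in caller_double_ret) just reinterprets those 64 bits, modeled here with memcpy.

```cpp
#include <cstdint>
#include <cstring>

double callee_double_ret() { return 1.0; }       // value returned in $fa0

int main() {
  double D = callee_double_ret();
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));          // the caller's bitcast to i64
  return Bits == 0x3ff0000000000000ull ? 0 : 1;  // IEEE-754 encoding of 1.0
}
```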