Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -163,6 +163,10 @@
         "AMDGPUSubtarget::SOUTHERN_ISLANDS",
        CCDelegateTo<CC_SI>>,
   CCIf<"static_cast<const AMDGPUSubtarget&>"
+        "(State.getMachineFunction().getSubtarget()).getGeneration() >= "
+        "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
+       CCDelegateTo<CC_AMDGPU_Func>>,
+  CCIf<"static_cast<const AMDGPUSubtarget&>"
         "(State.getMachineFunction().getSubtarget()).getGeneration() < "
         "AMDGPUSubtarget::SOUTHERN_ISLANDS",
        CCDelegateTo<CC_R600>>
Index: lib/Target/AMDGPU/AMDGPUFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -33,10 +33,6 @@
   /// \returns The number of 32-bit sub-registers that are used when storing
   /// values to the stack.
   unsigned getStackWidth(const MachineFunction &MF) const;
-
-  bool hasFP(const MachineFunction &MF) const override {
-    return false;
-  }
 };
 
 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
 #include "R600MachineFunctionInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,7 +30,9 @@
 void AMDGPUInstrInfo::anchor() {}
 
 AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
-  : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
+  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+    ST(ST),
+    AMDGPUASI(ST.getAMDGPUAS()) {}
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
 // the first 16 loads will be interleaved with the stores, and the next 16 will
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -82,6 +82,22 @@
 def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
 def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
 
+def callseq_start : SDNode<"ISD::CALLSEQ_START",
+  SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+  [SDNPHasChain, SDNPOutGlue]
+>;
+
+def callseq_end : SDNode<"ISD::CALLSEQ_END",
+  SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
+>;
+
+def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
+  SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+   SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
   [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -121,6 +121,9 @@
     MCOp = MCOperand::createExpr(Expr);
     return true;
   }
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
   }
 }
 
Index: lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -56,6 +56,20 @@
   }
 }
 
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+  // FIXME
+  static MCPhysReg Regs[2];
+
+  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  assert(!MFI->isEntryFunction());
+
+  Regs[0] = MFI->getFrameOffsetReg();
+  Regs[1] = AMDGPU::NoRegister;
+
+  return Regs;
+}
+
 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                      CallingConv::ID CC) const {
   switch (CC) {
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -69,6 +69,9 @@
       return -1;
     return 0;
   }
+
+  LLVM_READONLY
+  bool enableFunctionCalls() const;
 };
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -123,6 +123,12 @@
   cl::init(false),
   cl::Hidden);
 
+static cl::opt<bool> EnableAMDGPUFunctionCalls(
+  "amdgpu-function-calls",
+  cl::Hidden,
+  cl::desc("Enable AMDGPU function call support"),
+  cl::init(false));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -268,6 +274,11 @@
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
+bool AMDGPUTargetMachine::enableFunctionCalls() const {
+  return EnableAMDGPUFunctionCalls &&
+         getTargetTriple().getArch() == Triple::amdgcn;
+}
+
 StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
   Attribute GPUAttr = F.getFnAttribute("target-cpu");
   return GPUAttr.hasAttribute(Attribute::None) ?
Index: lib/Target/AMDGPU/R600FrameLowering.h
===================================================================
--- lib/Target/AMDGPU/R600FrameLowering.h
+++ lib/Target/AMDGPU/R600FrameLowering.h
@@ -27,6 +27,10 @@
                     MachineBasicBlock &MBB) const override {}
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
+
+  bool hasFP(const MachineFunction &MF) const override {
+    return false;
+  }
 };
 
 } // end namespace llvm
Index: lib/Target/AMDGPU/SIFrameLowering.h
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.h
+++ lib/Target/AMDGPU/SIFrameLowering.h
@@ -39,6 +39,11 @@
                                 MachineFunction &MF,
                                 RegScavenger *RS = nullptr) const override;
 
+  MachineBasicBlock::iterator
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const override;
+
 private:
   void emitFlatScratchInit(const SISubtarget &ST,
                            MachineFunction &MF,
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -575,6 +575,41 @@
   }
 }
 
+MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
+  MachineFunction &MF,
+  MachineBasicBlock &MBB,
+  MachineBasicBlock::iterator I) const {
+  int64_t Amount = I->getOperand(0).getImm();
+  if (Amount == 0)
+    return MBB.erase(I);
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const DebugLoc &DL = I->getDebugLoc();
+  unsigned Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    Amount = alignTo(Amount, Align);
+    assert(isUInt<32>(Amount) && "exceeded stack address space size");
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    unsigned SPReg = MFI->getStackPtrOffsetReg();
+
+    unsigned Op = IsDestroy ?
AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; + BuildMI(MBB, I, DL, TII->get(Op), SPReg) + .addReg(SPReg) + .addImm(Amount * ST.getWavefrontSize()); + } else if (CalleePopAmount != 0) { + llvm_unreachable("is this used?"); + } + + return MBB.erase(I); +} + void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { const SISubtarget &ST = MF.getSubtarget(); Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -182,6 +182,12 @@ bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool supportSplitCSR(MachineFunction *MF) const override; + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, @@ -198,6 +204,15 @@ const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const; + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const override; + unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1193,9 +1193,13 @@ if (TM.getOptLevel() == CodeGenOpt::None) HasStackObjects = true; + // For now assume stack access is needed in any callee functions, so we need + // the scratch registers to pass in. + bool RequiresStackAccess = HasStackObjects || MFI.hasCalls(); + const SISubtarget &ST = MF.getSubtarget(); if (ST.isAmdCodeObjectV2(MF)) { - if (HasStackObjects) { + if (RequiresStackAccess) { // If we have stack objects, we unquestionably need the private buffer // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. @@ -1204,9 +1208,23 @@ MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); - unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + if (MFI.hasCalls()) { + // If we have calls, we need to keep the frame register in a register + // that won't be clobbered by a call, so ensure it is copied somewhere. + + // This is not a problem for the scratch wave offset, because the same + // registers are reserved in all functions. + + // FIXME: Nothing is really ensuring this is a call preserved register, + // it's just selected from the end so it happens to be. + unsigned ReservedOffsetReg + = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); + Info.setScratchWaveOffsetReg(ReservedOffsetReg); + } else { + unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } } else { unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF); @@ -1229,7 +1247,7 @@ // offset is still in an input SGPR. 
Info.setScratchRSrcReg(ReservedBufferReg); - if (HasStackObjects) { + if (HasStackObjects && !MFI.hasCalls()) { unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); @@ -1241,6 +1259,50 @@ } } +bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const { + const SIMachineFunctionInfo *Info = MF->getInfo(); + return !Info->isEntryFunction(); +} + +void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + +} + +void SITargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl &Exits) const { + const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); + + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + MachineBasicBlock::iterator MBBI = Entry->begin(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AMDGPU::SReg_64RegClass.contains(*I)) + RC = &AMDGPU::SGPR_64RegClass; + else if (AMDGPU::SReg_32RegClass.contains(*I)) + RC = &AMDGPU::SGPR_32RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + Entry->addLiveIn(*I); + BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) + .addReg(*I); + + // Insert the copy-back instructions right before the terminator. + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), + TII->get(TargetOpcode::COPY), *I) + .addReg(NewVR); + } +} + SDValue SITargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, @@ -1581,6 +1643,22 @@ } // FIXME: Does sret work properly? + if (!Info->isEntryFunction()) { + const SIRegisterInfo *TRI + = static_cast(Subtarget)->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AMDGPU::SReg_64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AMDGPU::SReg_32RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } + } // Update chain and glue. RetOps[0] = Chain; @@ -1593,6 +1671,296 @@ return DAG.getNode(Opc, DL, MVT::Other, RetOps); } +SDValue SITargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg, + const SmallVectorImpl &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl &InVals, bool IsThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg); + + // Assign locations to each value returned by this call. + SmallVector RVLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC); + + // Copy all of the result registers out of their specified physreg. 
+ for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + SDValue Val; + + if (VA.isRegLoc()) { + Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + } else if (VA.isMemLoc()) { + report_fatal_error("TODO: return values in memory"); + } else + llvm_unreachable("unknown argument location type"); + + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val, + DAG.getValueType(VA.getValVT())); + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + + InVals.push_back(Val); + } + + return Chain; +} + +// The wave scratch offset register is used as the global base pointer. +SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + const AMDGPUTargetMachine &TM = + static_cast(getTargetMachine()); + if (!TM.enableFunctionCalls()) + return AMDGPUTargetLowering::LowerCall(CLI, InVals); + + SelectionDAG &DAG = CLI.DAG; + const SDLoc &DL = CLI.DL; + SmallVector &Outs = CLI.Outs; + SmallVector &OutVals = CLI.OutVals; + SmallVector &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + bool IsSibCall = false; + bool IsThisReturn = false; + MachineFunction &MF = DAG.getMachineFunction(); + + // TODO: Implement tail calls. + IsTailCall = false; + + if (IsVarArg || MF.getTarget().Options.GuaranteedTailCallOpt) { + report_fatal_error("varargs and tail calls not implemented"); + } + + if (GlobalAddressSDNode *GA = dyn_cast(Callee)) { + // FIXME: Remove this hack for function pointer types. + const GlobalValue *GV = GA->getGlobal(); + assert(Callee.getValueType() == MVT::i32); + Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), + false, GA->getTargetFlags()); + } + + const SIMachineFunctionInfo *Info = MF.getInfo(); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + CCInfo.AnalyzeCallOperands(Outs, AssignFn); + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; + + SmallVector, 8> RegsToPass; + + // Adjust the stack pointer for the new arguments... 
+ // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) { + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); + + unsigned OffsetReg = Info->getScratchWaveOffsetReg(); + + // In the HSA case, this should be an identity copy. + SDValue ScratchRSrcReg + = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); + RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + + // TODO: Don't hardcode these registers and get from the callee function. + SDValue ScratchWaveOffsetReg + = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32); + RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg); + } + + // Stack pointer relative accesses are done by changing the offset SGPR. This + // is just the VGPR offset component. + SDValue StackPtr = DAG.getConstant(0, DL, MVT::i32); + + SmallVector MemOpChains; + MVT PtrVT = MVT::i32; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + + // Promote the value if needed. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset; + SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32); + PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + + if (!IsTailCall) { + SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32); + + DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVol = */ false, /*AlwaysInline = */ true, + /*isTailCall = */ false, + DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); + MemOpChains.push_back(Store); + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. + SDValue InFlag; + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); + InFlag = Chain.getValue(1); + } + + // We don't usually want to end the call-sequence here because we would tidy + // the frame up *after* the call, however in the ABI-changing tail-call case + // we've carefully laid out the parameters so that when sp is reset they'll be + // in the correct location. 
+ if (IsTailCall && !IsSibCall) { + Chain = DAG.getCALLSEQ_END(Chain, + DAG.getTargetConstant(NumBytes, DL, MVT::i32), + DAG.getTargetConstant(0, DL, MVT::i32), + InFlag, DL); + InFlag = Chain.getValue(1); + } + + std::vector Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + if (IsTailCall) { + // Each tail call may have to adjust the stack by a different amount, so + // this information must travel along with the operation for eventual + // consumption by emitEpilogue. + Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); + } + + // Add argument registers to the end of the list so that they are known live + // into the call. + for (auto &RegToPass : RegsToPass) { + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); + } + + // Add a register mask operand representing the call-preserved registers. + + const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); + assert(Mask && "Missing call preserved mask for calling convention"); + Ops.push_back(DAG.getRegisterMask(Mask)); + + if (InFlag.getNode()) + Ops.push_back(InFlag); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + + // If we're doing a tall call, use a TC_RETURN here rather than an + // actual call instruction. + if (IsTailCall) { + MF.getFrameInfo().setHasTailCall(); + llvm_unreachable("not implemented"); + } + + // Returns a chain and a flag for retval copy to use. + SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops); + Chain = Call.getValue(0); + InFlag = Call.getValue(1); + + uint64_t CalleePopBytes = 0; + Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(NumBytes, DL, MVT::i32), + DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} + unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) @@ -2258,6 +2626,27 @@ MI.eraseFromParent(); return BB; } + case AMDGPU::ADJCALLSTACKUP: + case AMDGPU::ADJCALLSTACKDOWN: { + const SIMachineFunctionInfo *Info = MF->getInfo(); + MachineInstrBuilder MIB(*MF, &MI); + MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) + .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); + return BB; + } + case AMDGPU::SI_CALL: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); + MachineInstrBuilder MIB = + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_SWAPPC_B64), ReturnAddrReg); + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) + MIB.add(MI.getOperand(I)); + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + + MI.eraseFromParent(); + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } @@ -2923,13 +3312,16 @@ SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast(Op); + const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS) + GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && + // FIXME: It isn't correct to rely on the type of the pointer. 
This should + // be removed when address space 0 is 64-bit. + !GV->getType()->getElementType()->isFunctionTy()) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); SDLoc DL(GSD); - const GlobalValue *GV = GSD->getGlobal(); EVT PtrVT = Op.getValueType(); if (shouldEmitFixup(GV)) Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -317,6 +317,45 @@ let DisableWQM = 1; } +// Return for returning function calls. +def SI_RETURN : SPseudoInstSI < + (outs), (ins), [], + "; return"> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; + let SchedRW = [WriteBranch]; +} + +// Return for returning function calls. +def SI_CALL : SPseudoInstSI < + (outs), (ins SSrc_b64:$src0), [(AMDGPUcall i64:$src0)], + "; call $src0"> { + let Size = 4; + let isCall = 1; + let SchedRW = [WriteBranch]; + let usesCustomInserter = 1; +} + +def ADJCALLSTACKUP : SPseudoInstSI< + (outs), (ins i32imm:$amt0, i32imm:$amt1), + [(callseq_start timm:$amt0, timm:$amt1)], + "; adjcallstackup $amt0 $amt1"> { + let Size = 8; // Worst case. (s_add_u32 + constant) + let FixedSize = 1; + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + +def ADJCALLSTACKDOWN : SPseudoInstSI< + (outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_end timm:$amt1, timm:$amt2)], + "; adjcallstackdown $amt1"> { + let Size = 8; // Worst case. (s_add_u32 + constant) + let hasSideEffects = 1; + let usesCustomInserter = 1; +} + let Defs = [M0, EXEC], UseNamedOperandTable = 1 in { Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -63,6 +63,7 @@ BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -237,8 +237,15 @@ return true; } -bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const { - return MF.getFrameInfo().hasStackObjects(); +bool SIRegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.hasStackObjects()) + return true; + + // May need to deal with callee saved registers. 
+ const SIMachineFunctionInfo *Info = MF.getInfo(); + return !Info->isEntryFunction(); } bool SIRegisterInfo::requiresFrameIndexReplacementScavenging( Index: lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.td +++ lib/Target/AMDGPU/SIRegisterInfo.td @@ -269,6 +269,18 @@ // Register classes used as source and destination //===----------------------------------------------------------------------===// +def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, + (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + let isAllocatable = 0; + let CopyCost = -1; +} + +def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64], 32, + (add PRIVATE_RSRC_REG)> { + let isAllocatable = 0; + let CopyCost = -1; +} + // Subset of SReg_32 without M0 for SMRD instructions and alike. // See comments in SIInstructions.td for more info. def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, Index: test/CodeGen/AMDGPU/basic-call-return.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/basic-call-return.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define void @void_func_void() #2 { + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void: +define amdgpu_kernel void @test_call_void_func_void() { + call void @void_func_void() + ret void +} + +define void @void_func_void_clobber_s40_s41() #2 { + call void asm sideeffect "", "~{SGPR40_SGPR41}"() #0 + ret void +} + +define amdgpu_kernel void @test_call_void_func_void_clobber_s40_s41() { + call void @void_func_void_clobber_s40_s41() + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } Index: test/CodeGen/AMDGPU/byval-frame-setup.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -0,0 +1,235 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s + +%struct.ByValStruct = type { [4 x i32] } + +; GCN-LABEL: {{^}}void_func_byval_struct: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5{{$}} +; GCN-NOT: s32 + +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:16{{$}} +; GCN-NOT: s32 +define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 { +entry: + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + store volatile i32 %add, i32* %arrayidx, 
align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %tmp1, 2 + store volatile i32 %add3, i32* %arrayidx2, align 4 + store volatile i32 9, i32 addrspace(1)* null, align 4 + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v32 +; GCN: v_writelane_b32 + +; GCN-DAG: s_add_u32 s32, s32, 0x900{{$}} + +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}} +; GCN: v_add_i32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] +; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}} + +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN: v_add_i32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]] + +; GCN: s_swappc_b64 + +; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}} + +; GCN: v_readlane_b32 +; GCN: buffer_load_dword v32, +; GCN: s_sub_u32 s32, s32, 0x900{{$}} +; GCN: s_setpc_b64 +define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 { +entry: + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + store volatile i32 %add, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %tmp1, 2 + call void @external_void_func_void() + store volatile i32 %add3, i32* %arrayidx2, align 4 + store volatile i32 9, i32 addrspace(1)* null, align 4 + ret void +} + +; GCN-LABEL: {{^}}call_void_func_byval_struct_func: +; GCN: s_mov_b32 s5, s32 +; GCN: s_add_u32 s32, s32, 0xa00{{$}} +; GCN: v_writelane_b32 + +; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}} +; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 + +; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6 +; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6 + +; GCN-DAG: v_add_i32_e64 [[FI_ADD0:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 8, +; GCN-DAG: v_or_b32_e32 [[FI_OR0:v[0-9]+]], 4, [[FI_ADD0]] + +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24 + +; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:4 +; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:8 + +; FIXME: or fails to combine with add, so FI doesn't fold and scratch wave offset is used +; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6 +; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6 + +; GCN-DAG: v_add_i32_e64 [[FI_ADD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 24, +; GCN-DAG: v_or_b32_e32 [[FI_OR1:v[0-9]+]], 4, [[FI_ADD1]] + +; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 + + + +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12 +; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}} +; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4 + + +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:4 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[FI_OR1]], s[0:3], s4 
offen offset:8 +; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 + + +; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24 +; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20 + +; GCN: s_swappc_b64 +; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}} + +; GCN: v_readlane_b32 + +; GCN: s_sub_u32 s32, s32, 0xa00{{$}} +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @call_void_func_byval_struct_func() #0 { +entry: + %arg0 = alloca %struct.ByValStruct, align 4 + %arg1 = alloca %struct.ByValStruct, align 4 + %tmp = bitcast %struct.ByValStruct* %arg0 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) + %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + ret void +} + +; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel: +; GCN: s_mov_b32 s33, s7 +; GCN: s_add_u32 s32, s33, 0xa00{{$}} + +; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 +; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 +; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 +; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 + +; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}} + +; FIXME: Fold offset +; GCN-DAG: v_or_b32_e32 [[OR_FI0:v[0-9]+]], 4, + +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:4 +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:8 + +; FIXME: Fold offset +; GCN-DAG: v_or_b32_e32 [[OR_FI1:v[0-9]+]], 4, + +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12 +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8 + + +; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:8 +; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:12 +; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4 +; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}} + + + +; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:4 +; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:8 +; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28 +; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24 + + +; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:24 +; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:28 +; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16 +; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20 + + +; GCN: s_swappc_b64 +; FIXME: Dead SP modfication +; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 { +entry: + %arg0 = alloca %struct.ByValStruct, align 4 + %arg1 = alloca 
%struct.ByValStruct, align 4 + %tmp = bitcast %struct.ByValStruct* %arg0 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) + %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + ret void +} + +; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim: +define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 { +entry: + %arg0 = alloca %struct.ByValStruct, align 4 + %arg1 = alloca %struct.ByValStruct, align 4 + %tmp = bitcast %struct.ByValStruct* %arg0 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp) + %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1) + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0 + store volatile i32 9, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0 + store volatile i32 13, i32* %arrayidx2, align 4 + call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1) + call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp) + ret void +} + +declare void @external_void_func_void() #0 + +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3 + +attributes #0 = { nounwind } +attributes #1 = { noinline norecurse nounwind } +attributes #2 = { nounwind norecurse "no-frame-pointer-elim"="true" } Index: test/CodeGen/AMDGPU/call-argument-types.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/call-argument-types.ll @@ -0,0 +1,527 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s +; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s + +declare void @external_void_func_i1(i1) #0 +declare void @external_void_func_i1_signext(i1 signext) #0 +declare void @external_void_func_i1_zeroext(i1 zeroext) #0 + +declare void @external_void_func_i8(i8) #0 +declare void @external_void_func_i8_signext(i8 signext) #0 +declare void @external_void_func_i8_zeroext(i8 zeroext) #0 + +declare 
void @external_void_func_i16(i16) #0 +declare void @external_void_func_i16_signext(i16 signext) #0 +declare void @external_void_func_i16_zeroext(i16 zeroext) #0 + +declare void @external_void_func_i32(i32) #0 +declare void @external_void_func_i64(i64) #0 + +declare void @external_void_func_f16(half) #0 +declare void @external_void_func_f32(float) #0 +declare void @external_void_func_f64(double) #0 + +declare void @external_void_func_v2i16(<2 x i16>) #0 +declare void @external_void_func_v2f16(<2 x half>) #0 + +declare void @external_void_func_v2i32(<2 x i32>) #0 +declare void @external_void_func_v3i32(<3 x i32>) #0 +declare void @external_void_func_v4i32(<4 x i32>) #0 +declare void @external_void_func_v8i32(<8 x i32>) #0 +declare void @external_void_func_v16i32(<16 x i32>) #0 +declare void @external_void_func_v32i32(<32 x i32>) #0 +declare void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 + +; return value and argument +declare i32 @external_i32_func_i32(i32) #0 + +; Structs +declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0 +declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0 +declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0 + +declare void @external_void_func_v16i8(<16 x i8>) #0 + + +; FIXME: Should be passing -1 +; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm: +; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD + +; MESA-DAG: s_mov_b64 s[0:1], s[36:37] + +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+4 +; GCN-DAG: v_mov_b32_e32 v0, 1{{$}} +; MESA-DAG: s_mov_b64 s[2:3], s[38:39] + +; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { + call void @external_void_func_i1(i1 true) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: +; MESA: s_mov_b32 s33, s3{{$}} +; HSA: s_mov_b32 s33, s9{{$}} + +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+4 +; GCN-NEXT: buffer_load_ubyte [[VAR:v[0-9]+]] +; HSA-NEXT: s_mov_b32 s4, s33 +; HSA-NEXT: s_mov_b32 s32, s33 + +; MESA-DAG: s_mov_b32 s4, s33{{$}} +; MESA-DAG: s_mov_b32 s32, s33{{$}} + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1 +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { + %var = load volatile i1, i1 addrspace(1)* undef + call void @external_void_func_i1_signext(i1 %var) + ret void +} + +; FIXME: load should be scheduled before getpc +; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: +; MESA: s_mov_b32 s33, s3{{$}} + +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+4 +; GCN-NEXT: buffer_load_ubyte v0 + +; GCN-DAG: s_mov_b32 s4, s33{{$}} +; GCN-DAG: s_mov_b32 s32, s33{{$}} + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; 
GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { + %var = load volatile i1, i1 addrspace(1)* undef + call void @external_void_func_i1_zeroext(i1 %var) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: +; MESA-DAG: s_mov_b32 s33, s3{{$}} + +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 +; GCN-NEXT: v_mov_b32_e32 v0, 0x7b + +; HSA-DAG: s_mov_b32 s4, s33{{$}} +; GCN-DAG: s_mov_b32 s32, s33{{$}} + +; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { + call void @external_void_func_i8(i8 123) + ret void +} + +; FIXME: don't wait before call +; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: +; HSA-DAG: s_mov_b32 s33, s9{{$}} +; MESA-DAG: s_mov_b32 s33, s3{{$}} + +; GCN-DAG: buffer_load_sbyte v0 +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 + +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s3 + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { + %var = load volatile i8, i8 addrspace(1)* undef + call void @external_void_func_i8_signext(i8 %var) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: +; MESA-DAG: s_mov_b32 s33, s3{{$}} +; HSA-DAG: s_mov_b32 s33, s9{{$}} + +; GCN-DAG: buffer_load_ubyte v0 +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 + +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { + %var = load volatile i8, i8 addrspace(1)* undef + call void @external_void_func_i8_zeroext(i8 %var) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: +; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} + +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 + +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { + call void @external_void_func_i16(i16 123) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: +; MESA-DAG: s_mov_b32 s33, s3{{$}} + +; GCN-DAG: buffer_load_sshort v0 +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 + +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { + %var = load volatile i16, i16 addrspace(1)* undef + call void @external_void_func_i16_signext(i16 %var) + ret void +} 
+ +; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: +; MESA-DAG: s_mov_b32 s33, s3{{$}} + + +; GCN-DAG: buffer_load_ushort v0 +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 + +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 + +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { + %var = load volatile i16, i16 addrspace(1)* undef + call void @external_void_func_i16_zeroext(i16 %var) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: +; MESA-DAG: s_mov_b32 s33, s3{{$}} + +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 +; GCN: v_mov_b32_e32 v0, 42 +; GCN-DAG: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s32, s33 + +; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { + call void @external_void_func_i32(i32 42) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm: +; GCN-DAG: s_movk_i32 [[K0:s[0-9]+]], 0x7b{{$}} +; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v0, [[K0]] +; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4 +; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+4 +; GCN-DAG: v_mov_b32_e32 v1, [[K1]] +; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { + call void @external_void_func_i64(i64 123) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm: +; VI: v_mov_b32_e32 v0, 0x4400 +; CI: v_mov_b32_e32 v0, 4.0 +; GCN-NOT: v0 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { + call void @external_void_func_f16(half 4.0) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm: +; GCN: v_mov_b32_e32 v0, 4.0 +; GCN-NOT: v0 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { + call void @external_void_func_f32(float 4.0) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm: +; GCN: v_mov_b32_e32 v0, 0{{$}} +; GCN: v_mov_b32_e32 v1, 0x40100000 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { + call void @external_void_func_f64(double 4.0) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v2i16: +; GFX9: buffer_load_dword v0 +; GFX9-NOT: v0 +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { + %val = load <2 x i16>, <2 x i16> addrspace(1)* undef + call void @external_void_func_v2i16(<2 x i16> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v2f16: +; GFX9: buffer_load_dword v0 +; GFX9-NOT: v0 +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* undef + call void @external_void_func_v2f16(<2 x half> %val) + ret void +} + +; 
GCN-LABEL: {{^}}test_call_external_void_func_v2i32: +; GCN: buffer_load_dwordx2 v[0:1] +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* undef + call void @external_void_func_v2i32(<2 x i32> %val) + ret void +} + +; FIXME: Passing 4th +; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: +; HSA-DAG: s_mov_b32 s33, s9 +; MESA-DAG: s_mov_b32 s33, s3{{$}} + +; GCN-DAG: v_mov_b32_e32 v0 +; GCN-DAG: v_mov_b32_e32 v1 +; GCN-DAG: v_mov_b32_e32 v2 +; GCN-DAG: v_mov_b32_e32 v3 + +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { + call void @external_void_func_v3i32(<3 x i32> ) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v4i32: +; GCN: buffer_load_dwordx4 v[0:3] +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { + %val = load <4 x i32>, <4 x i32> addrspace(1)* undef + call void @external_void_func_v4i32(<4 x i32> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { + %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr + call void @external_void_func_v8i32(<8 x i32> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v16i32: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { + %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr + call void @external_void_func_v16i32(<16 x i32> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v32i32: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { + %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr + call void @external_void_func_v32i32(<32 x i32> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: +; HSA-DAG: s_mov_b32 s33, s9 +; HSA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}} + +; MESA-DAG: s_mov_b32 s33, s3{{$}} +; MESA-DAG: s_add_u32 [[SP_REG:s[0-9]+]], s33, 0x100{{$}} + +; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], 
off + +; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SP_REG]]{{$}} +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 + %val1 = load i32, i32 addrspace(1)* undef + call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1) + ret void +} + +; FIXME: No wait after call +; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm: +; GCN: v_mov_b32_e32 v0, 42 +; GCN: s_swappc_b64 s[30:31], +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0 +define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { + %val = call i32 @external_i32_func_i32(i32 42) + store volatile i32 %val, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: +; GCN: buffer_load_ubyte v0, off +; GCN: buffer_load_dword v1, off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { + %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef + %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 + call void @external_void_func_struct_i8_i32({ i8, i32 } %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: +; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}} + +; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 +; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 +; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8 +; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12 + +; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 +; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 + +; GCN: s_add_u32 [[SP]], [[SP]], 0x200 + +; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 +; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 + +; HSA: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 +; HSA: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}} + + +; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 +; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 + +; MESA: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4 +; MESA: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}} + +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_sub_u32 [[SP]], [[SP]], 0x200 +define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 { + %val = alloca { i8, i32 }, align 4 + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1 + store i8 3, i8* %gep0 + store i32 8, i32* %gep1 + call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: +; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}} +; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x600{{$}} + +; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 +; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 +; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 +; GCN-DAG: buffer_store_dword [[VAL1]], off, 
s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 + +; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 +; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 + +; GCN-DAG: s_add_u32 [[SP]], [[SP]], 0x200 +; GCN: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 +; GCN: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} +; GCN-NEXT: s_swappc_b64 +; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 +; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 +; GCN: s_sub_u32 [[SP]], [[SP]], 0x200 + +; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off +; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off +define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { + %in.val = alloca { i8, i32 }, align 4 + %out.val = alloca { i8, i32 }, align 4 + %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0 + %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1 + store i8 3, i8* %in.gep0 + store i32 8, i32* %in.gep1 + call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val) + %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0 + %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1 + %out.val0 = load i8, i8* %out.gep0 + %out.val1 = load i32, i32* %out.gep1 + + store volatile i8 %out.val0, i8 addrspace(1)* undef + store volatile i32 %out.val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v16i8: +define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { + %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + call void @external_void_func_v16i8(<16 x i8> %val) + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } Index: test/CodeGen/AMDGPU/call-preserved-registers.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -0,0 +1,251 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare void @external_void_func_void() #0 + +; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GCN: s_mov_b32 s33, s7 +; GCN: s_getpc_b64 s[34:35] +; GCN-NEXT: s_add_u32 s34, s34, +; GCN-NEXT: s_addc_u32 s35, s35, +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 s[30:31], s[34:35] + +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: #ASMSTART +; GCN-NEXT: #ASMEND +; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35] +define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { + call void @external_void_func_void() + call void asm sideeffect "", ""() #0 + call void @external_void_func_void() + ret 
void +} + +; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: +; GCN: v_writelane_b32 v32, s33, 0 +; GCN: v_writelane_b32 v32, s34, 1 +; GCN: v_writelane_b32 v32, s35, 2 +; GCN: v_writelane_b32 v32, s36, 3 +; GCN: v_writelane_b32 v32, s37, 4 + +; GCN: s_mov_b32 s33, s5 +; GCN: s_swappc_b64 +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_swappc_b64 +; GCN: s_mov_b32 s5, s33 +; GCN: v_readlane_b32 s37, v32, 4 +; GCN: v_readlane_b32 s36, v32, 3 +; GCN: v_readlane_b32 s35, v32, 2 +; GCN: v_readlane_b32 s34, v32, 1 +; GCN: v_readlane_b32 s33, v32, 0 +; GCN: s_setpc_b64 +define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { + call void @external_void_func_void() + call void asm sideeffect "", ""() #0 + call void @external_void_func_void() + ret void +} + +; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31: +; GCN: s_waitcnt +; GCN-NEXT: s_mov_b64 [[SAVEPC:s\[[0-9]+:[0-9]+\]]], s[30:31] +; GCN-NEXT: #ASMSTART +; GCN: ; clobber +; GCN-NEXT: #ASMEND +; GCN-NEXT: s_mov_b64 s[30:31], [[SAVEPC]] +; GCN-NEXT: s_setpc_b64 s[30:31] +define void @void_func_void_clobber_s30_s31() #2 { + call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 + ret void +} + +; GCN-LABEL: {{^}}void_func_void_clobber_vcc: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_setpc_b64 s[30:31] +define void @void_func_void_clobber_vcc() #2 { + call void asm sideeffect "", "~{VCC}"() #0 + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: +; GCN: s_getpc_b64 +; GCN-NEXT: s_add_u32 +; GCN-NEXT: s_addc_u32 +; GCN: s_mov_b64 s[34:35], vcc +; GCN-NEXT: s_swappc_b64 +; GCN: s_mov_b64 vcc, s[34:35] +define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(i32 addrspace(1)* %out) #0 { + %vcc = call i64 asm sideeffect "; def $0", "={vcc}"() + call void @void_func_void_clobber_vcc() + %val0 = load volatile i32, i32 addrspace(1)* undef + %val1 = load volatile i32, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "{vcc}"(i64 %vcc) + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: +; GCN: s_mov_b32 s33, s31 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: s_mov_b32 s31, s33 +define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { + %s31 = call i32 asm sideeffect "; def $0", "={s31}"() + call void @external_void_func_void() + call void asm sideeffect "; use $0", "{s31}"(i32 %s31) + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: +; GCN: v_mov_b32_e32 v32, v31 +; GCN-NEXT: s_swappc_b64 +; GCN-NEXT: v_mov_b32_e32 v31, v32 +define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { + %v31 = call i32 asm sideeffect "; def $0", "={v31}"() + call void @external_void_func_void() + call void asm sideeffect "; use $0", "{v31}"(i32 %v31) + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: +; GCN: s_mov_b32 s34, s9 +; GCN: ; def s33 +; GCN-NEXT: #ASMEND +; GCN: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 +; GCN-NEXT: s_mov_b32 s4, s34 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use s33 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace(1)* %out) #0 { + 
%s33 = call i32 asm sideeffect "; def $0", "={s33}"() + call void @external_void_func_void() + call void asm sideeffect "; use $0", "{s33}"(i32 %s33) + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: +; GCN: s_mov_b32 s33, s9 +; GCN: ; def v32 +; GCN-NEXT: #ASMEND +; GCN: s_getpc_b64 s[6:7] +; GCN-NEXT: s_add_u32 s6, s6, external_void_func_void@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s7, s7, external_void_func_void@rel32@hi+4 +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: ;;#ASMSTART +; GCN-NEXT: ; use v32 +; GCN-NEXT: ;;#ASMEND +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 { + %v32 = call i32 asm sideeffect "; def $0", "={v32}"() + call void @external_void_func_void() + call void asm sideeffect "; use $0", "{v32}"(i32 %v32) + ret void +} + +; GCN-LABEL: {{^}}void_func_void_clobber_s33: +; GCN: v_writelane_b32 v0, s33, 0 +; GCN-NEXT: #ASMSTART +; GCN-NEXT: ; clobber +; GCN-NEXT: #ASMEND +; GCN-NEXT: v_readlane_b32 s33, v0, 0 +; GCN-NEXT: s_setpc_b64 +define void @void_func_void_clobber_s33() #2 { + call void asm sideeffect "; clobber", "~{s33}"() #0 + ret void +} + +; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: +; GCN: s_mov_b32 s33, s7 +; GCN: s_getpc_b64 +; GCN-NEXT: s_add_u32 +; GCN-NEXT: s_addc_u32 +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +; GCN-NEXT: s_endpgm +define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { + call void @void_func_void_clobber_s33() + ret void +} + +; GCN-LABEL: {{^}}callee_saved_sgpr_func: +; GCN-NOT: s40 +; GCN: v_writelane_b32 v32, s40 +; GCN: s_swappc_b64 +; GCN-NOT: s40 +; GCN: ; use s40 +; GCN-NOT: s40 +; GCN: v_readlane_b32 s40, v32 +; GCN-NOT: s40 +define void @callee_saved_sgpr_func() #2 { + %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 + call void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + ret void +} + +; GCN-LABEL: {{^}}callee_saved_sgpr_kernel: +; GCN-NOT: s40 +; GCN: ; def s40 +; GCN-NOT: s40 +; GCN: s_swappc_b64 +; GCN-NOT: s40 +; GCN: ; use s40 +; GCN-NOT: s40 +define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { + %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 + call void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + ret void +} + +; First call preserved VGPR is used so it can't be used for SGPR spills. 
+; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: +; GCN-NOT: s40 +; GCN: v_writelane_b32 v33, s40 +; GCN: s_swappc_b64 +; GCN-NOT: s40 +; GCN: ; use s40 +; GCN-NOT: s40 +; GCN: v_readlane_b32 s40, v33 +; GCN-NOT: s40 +define void @callee_saved_sgpr_vgpr_func() #2 { + %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 + %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + call void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + ret void +} + +; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel: +; GCN-NOT: s40 +; GCN: ; def s40 +; GCN-NOT: s40 +; GCN: s_swappc_b64 +; GCN-NOT: s40 +; GCN: ; use s40 +; GCN-NOT: s40 +define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 { + %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 + %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + call void @external_void_func_void() + call void asm sideeffect "; use $0", "s"(i32 %s40) #0 + call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } Index: test/CodeGen/AMDGPU/call-return-types.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/call-return-types.ll @@ -0,0 +1,241 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare void @external_void_func_void() #0 + +declare i1 @external_i1_func_void() #0 +declare zeroext i1 @external_i1_zeroext_func_void() #0 +declare signext i1 @external_i1_signext_func_void() #0 + +declare i8 @external_i8_func_void() #0 +declare zeroext i8 @external_i8_zeroext_func_void() #0 +declare signext i8 @external_i8_signext_func_void() #0 + +declare i16 @external_i16_func_void() #0 +declare zeroext i16 @external_i16_zeroext_func_void() #0 +declare signext i16 @external_i16_signext_func_void() #0 + +declare i32 @external_i32_func_void() #0 +declare i64 @external_i64_func_void() #0 +declare half @external_f16_func_void() #0 +declare float @external_f32_func_void() #0 +declare double @external_f64_func_void() #0 + +declare <2 x i32> @external_v2i32_func_void() #0 +declare <3 x i32> @external_v3i32_func_void() #0 +declare <4 x i32> @external_v4i32_func_void() #0 +declare <5 x i32> @external_v5i32_func_void() #0 +declare <8 x i32> @external_v8i32_func_void() #0 +declare <16 x i32> @external_v16i32_func_void() #0 +declare <32 x i32> @external_v32i32_func_void() #0 +declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0 +declare <2 x i16> @external_v2i16_func_void() #0 +declare <2 x half> @external_v2f16_func_void() #0 + +declare { i32, i64 } @external_i32_i64_func_void() #0 + +; GCN-LABEL: {{^}}test_call_external_void_func_void: +define amdgpu_kernel void @test_call_external_void_func_void() #0 { + call void @external_void_func_void() + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_void_x2: +define amdgpu_kernel void @test_call_external_void_func_void_x2() #0 { + call void @external_void_func_void() + call void @external_void_func_void() + ret void +} + +; GCN-LABEL: 
{{^}}test_call_external_i1_func_void: +define amdgpu_kernel void @test_call_external_i1_func_void() #0 { + %val = call i1 @external_i1_func_void() + store volatile i1 %val, i1 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i1_zeroext_func_void: +define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { + %val = call i1 @external_i1_zeroext_func_void() + %val.ext = zext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i1_signext_func_void: +define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { + %val = call i1 @external_i1_signext_func_void() + %val.ext = zext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i8_func_void: +define amdgpu_kernel void @test_call_external_i8_func_void() #0 { + %val = call i8 @external_i8_func_void() + store volatile i8 %val, i8 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i8_zeroext_func_void: +define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { + %val = call i8 @external_i8_zeroext_func_void() + %val.ext = zext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i8_signext_func_void: +define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { + %val = call i8 @external_i8_signext_func_void() + %val.ext = zext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i16_func_void: +define amdgpu_kernel void @test_call_external_i16_func_void() #0 { + %val = call i16 @external_i16_func_void() + store volatile i16 %val, i16 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i16_zeroext_func_void: +define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { + %val = call i16 @external_i16_zeroext_func_void() + %val.ext = zext i16 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i16_signext_func_void: +define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { + %val = call i16 @external_i16_signext_func_void() + %val.ext = zext i16 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i32_func_void: +define amdgpu_kernel void @test_call_external_i32_func_void() #0 { + %val = call i32 @external_i32_func_void() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i64_func_void: +define amdgpu_kernel void @test_call_external_i64_func_void() #0 { + %val = call i64 @external_i64_func_void() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_f16_func_void: +define amdgpu_kernel void @test_call_external_f16_func_void() #0 { + %val = call half @external_f16_func_void() + store volatile half %val, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_f32_func_void: +define amdgpu_kernel void @test_call_external_f32_func_void() #0 { + %val = call float @external_f32_func_void() + store volatile float %val, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_f64_func_void: +define amdgpu_kernel void @test_call_external_f64_func_void() #0 { + %val = call double @external_f64_func_void() + store volatile double %val, 
double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v2i32_func_void: +define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { + %val = call <2 x i32> @external_v2i32_func_void() + store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v3i32_func_void: +define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { + %val = call <3 x i32> @external_v3i32_func_void() + store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v4i32_func_void: +define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { + %val = call <4 x i32> @external_v4i32_func_void() + store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v5i32_func_void: +define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { + %val = call <5 x i32> @external_v5i32_func_void() + store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v8i32_func_void: +define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { + %val = call <8 x i32> @external_v8i32_func_void() + store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v16i32_func_void: +define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { + %val = call <16 x i32> @external_v16i32_func_void() + store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v32i32_func_void: +define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { + %val = call <32 x i32> @external_v32i32_func_void() + store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8 + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v2i16_func_void: +define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { + %val = call <2 x i16> @external_v2i16_func_void() + store volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_v2f16_func_void: +define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { + %val = call <2 x half> @external_v2f16_func_void() + store volatile <2 x half> %val, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}test_call_external_i32_i64_func_void: +define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 { + %val = call { i32, i64 } @external_i32_i64_func_void() + %val.0 = extractvalue { i32, i64 } %val, 0 + %val.1 = extractvalue { i32, i64 } %val, 1 + store volatile i32 %val.0, i32 addrspace(1)* undef + store volatile i64 %val.1, i64 addrspace(1)* undef + ret void +} + +; Requires writing results to stack +; GCN-LABEL: {{^}}test_call_external_v32i32_i32_func_void: +define amdgpu_kernel void @test_call_external_v32i32_i32_func_void() #0 { + %val = call { <32 x i32>, i32 } @external_v32i32_i32_func_void() + %val0 = extractvalue { <32 x i32>, i32 } %val, 0 + %val1 = extractvalue { <32 x i32>, i32 } %val, 1 + store volatile <32 x i32> %val0, <32 x i32> addrspace(1)* undef, align 8 + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } Index: test/CodeGen/AMDGPU/callee-frame-setup.ll =================================================================== --- 
test/CodeGen/AMDGPU/callee-frame-setup.ll +++ test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -amdgpu-function-calls -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}callee_no_stack: ; GCN: ; BB#0: @@ -8,6 +9,14 @@ ret void } +; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim: +; GCN: ; BB#0: +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @callee_no_stack_no_fp_elim() #1 { + ret void +} + ; Requires frame pointer for access to local regular object. ; GCN-LABEL: {{^}}callee_with_stack: @@ -24,4 +33,51 @@ ret void } +; GCN-LABEL: {{^}}callee_with_stack_and_call: +; GCN: ; BB#0: +; GCN-NEXT: s_waitcnt + +; GCN-DAG: s_mov_b32 s5, s32 +; GCN-DAG: v_writelane_b32 v32, s33, +; GCN-DAG: v_writelane_b32 v32, s34, +; GCN-DAG: v_writelane_b32 v32, s35, +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: s_add_u32 s32, s32, 0x200{{$}} +; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} +; GCN-DAG: s_mov_b32 s33, s5 + + +; GCN: s_swappc_b64 +; GCN: s_mov_b32 s5, s33 +; GCN-DAG: v_readlane_b32 s35, +; GCN-DAG: v_readlane_b32 s34, +; GCN-DAG: v_readlane_b32 s33, +; GCN: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @callee_with_stack_and_call() #0 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + call void @external_void_func_void() + ret void +} + +; Should be able to copy incoming stack pointer directly to inner +; call's stack pointer argument. + +; GCN-LABEL: {{^}}callee_no_stack_with_call: +; GCN: s_waitcnt +; GCN-NOT: s32 +; GCN: s_mov_b32 s33, s5 +; GCN: s_swappc_b64 +; GCN: s_mov_b32 s5, s33 +; GCN-NOT: s32 +; GCN: s_setpc_b64 +define void @callee_no_stack_with_call() #0 { + call void @external_void_func_void() + ret void +} + +declare void @external_void_func_void() #0 + attributes #0 = { nounwind } +attributes #1 = { nounwind "no-frame-pointer-elim"="true" } Index: test/CodeGen/AMDGPU/nested-calls.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/nested-calls.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-function-calls -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -amdgpu-function-calls -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VI %s + +; Test calls when called by other callable functions rather than +; kernels. 
+ +declare void @external_void_func_i32(i32) #0 + +; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm: +; GCN: s_waitcnt +; GCN-NOT: s32 +; GCN: s_swappc_b64 +; GCN-NOT: s32 +; GCN: s_setpc_b64 +define void @test_func_call_external_void_func_i32_imm() #0 { + call void @external_void_func_i32(i32 42) + ret void +} + +; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: +; GCN: s_waitcnt +; GCN: s_mov_b32 s5, s32 +; GCN: s_add_u32 s32, s32, 0x1100{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset +; GCN: s_swappc_b64 +; GCN: s_sub_u32 s32, s32, 0x1100{{$}} +; GCN: s_setpc_b64 +define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { + %alloca = alloca [16 x i32], align 4 + %gep0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 0 + %gep15 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 16 + store volatile i32 0, i32* %gep0 + store volatile i32 0, i32* %gep15 + call void @external_void_func_i32(i32 42) + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline }
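As a reading aid for the nested-call tests above, here is a minimal standalone sketch (not part of the patch) of the call shape they exercise: a kernel calling a callable function, which in turn calls an external function. It assumes only what the RUN lines above already use (the amdgcn target and the -amdgpu-function-calls flag); the function names @extern_leaf, @middle, and @entry are illustrative only.

; Assumed invocation, mirroring the RUN lines above:
;   llc -march=amdgcn -mcpu=fiji -amdgpu-function-calls -verify-machineinstrs < %s
declare void @extern_leaf(i32) #0

; Non-kernel callee that itself makes a call; this is the case that needs
; the callee-saved register and stack-pointer handling added by the patch.
define void @middle(i32 %x) #0 {
  call void @extern_leaf(i32 %x)
  ret void
}

; Kernel entry point calling the callable function above, giving the
; kernel -> function -> function nesting that nested-calls.ll checks.
define amdgpu_kernel void @entry() #0 {
  call void @middle(i32 42)
  ret void
}

attributes #0 = { nounwind }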