Index: lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -45,7 +45,7 @@
                                                unsigned Offset) const {
   MachineFunction &MF = MIRBuilder.getMF();
-  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = *MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
@@ -53,7 +53,7 @@
   LLT PtrType = getLLTForType(*PtrTy, DL);
   unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
   unsigned KernArgSegmentPtr =
-    TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+    MFI->getPreloadedReg(MF, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
   unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

   unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -24,7 +24,7 @@
 class AMDGPUMachineFunction;
 class AMDGPUSubtarget;
-class MachineRegisterInfo;
+struct ArgDescriptor;

 class AMDGPUTargetLowering : public TargetLowering {
 private:
@@ -241,6 +241,25 @@
     return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
   }

+  /// Similar to CreateLiveInRegister, except the value may be loaded from a
+  /// stack slot rather than passed in a register.
+  SDValue loadStackInputValue(SelectionDAG &DAG,
+                              EVT VT,
+                              const SDLoc &SL,
+                              int64_t Offset) const;
+
+  SDValue storeStackInputValue(SelectionDAG &DAG,
+                               const SDLoc &SL,
+                               SDValue Chain,
+                               SDValue StackPtr,
+                               SDValue ArgVal,
+                               int64_t Offset) const;
+
+  SDValue loadInputValue(SelectionDAG &DAG,
+                         const TargetRegisterClass *RC,
+                         EVT VT, const SDLoc &SL,
+                         const ArgDescriptor &Arg) const;
+
   enum ImplicitParameter {
     FIRST_IMPLICIT,
     GRID_DIM = FIRST_IMPLICIT,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3586,6 +3586,49 @@
   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
 }

+SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
+                                                  EVT VT,
+                                                  const SDLoc &SL,
+                                                  int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
+  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
+
+  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+                     MachineMemOperand::MODereferenceable |
+                     MachineMemOperand::MOInvariant);
+}
+
+SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   SDValue StackPtr,
+                                                   SDValue ArgVal,
+                                                   int64_t Offset) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+
+  SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset);
+  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+                               MachineMemOperand::MODereferenceable);
+  return Store;
+}
+
+SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
+                                             const TargetRegisterClass *RC,
+                                             EVT VT, const SDLoc &SL,
+                                             const ArgDescriptor &Arg) const {
+  assert(Arg && "Attempting to load missing argument");
+
+  if (Arg.isRegister())
+    return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
+  return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
+}
+
 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -38,6 +38,7 @@
                                           MachineBasicBlock &MBB) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

   // We don't need this if we only have spills since there is no user facing
   // scratch.
@@ -55,7 +56,7 @@
   MachineBasicBlock::iterator I = MBB.begin();

   unsigned FlatScratchInitReg
-    = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+    = MFI->getPreloadedReg(MF, AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

   MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.addLiveIn(FlatScratchInitReg);
@@ -64,7 +65,6 @@
   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
   unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

   // Do a 64-bit pointer add.
@@ -282,13 +282,13 @@
   }

   // We need to insert initialization of the scratch resource descriptor.
-  unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
-    MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+    MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2(MF)) {
-    PreloadedPrivateBufferReg = TRI->getPreloadedValue(
-      MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
+      MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
   }

   bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -16,6 +16,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIISELLOWERING_H

 #include "AMDGPUISelLowering.h"
+#include "AMDGPUArgumentUsageInfo.h"
 #include "SIInstrInfo.h"

 namespace llvm {
@@ -31,6 +32,10 @@
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
                               const SDLoc &SL, SDValue Chain,
                               const ISD::InputArg &Arg) const;
+  SDValue getPreloadedValue(SelectionDAG &DAG,
+                            const SIMachineFunctionInfo &MFI,
+                            EVT VT,
+                            AMDGPUFunctionArgInfo::PreloadedValue) const;

   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
@@ -207,6 +212,14 @@
                       const SmallVectorImpl<SDValue> &OutVals,
                       const SDLoc &DL, SelectionDAG &DAG) const override;

+  void passSpecialInputs(
+    CallLoweringInfo &CLI,
+    const SIMachineFunctionInfo &Info,
+    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains,
+    SDValue Chain,
+    SDValue StackPtr) const;
+
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool isVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
=================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -45,6 +45,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" @@ -888,14 +889,19 @@ uint64_t Offset) const { const DataLayout &DL = DAG.getDataLayout(); MachineFunction &MF = DAG.getMachineFunction(); - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, - SIRegisterInfo::KERNARG_SEGMENT_PTR); + const SIMachineFunctionInfo *Info = MF.getInfo(); + + const ArgDescriptor *InputPtrReg; + const TargetRegisterClass *RC; + + std::tie(InputPtrReg, RC) = Info->getPreloadedValue(MF, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS); SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, - MRI.getLiveInVirtReg(InputPtrReg), PtrVT); + MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); + return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, DAG.getConstant(Offset, SL, PtrVT)); } @@ -991,6 +997,19 @@ return ArgValue; } +SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, + const SIMachineFunctionInfo &MFI, + EVT VT, + AMDGPUFunctionArgInfo::PreloadedValue PVID) const { + const ArgDescriptor *Reg; + const TargetRegisterClass *RC; + + MachineFunction &MF = DAG.getMachineFunction(); + std::tie(Reg, RC) = MFI.getPreloadedValue(MF, PVID); + + return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); +} + static void processShaderInputArgs(SmallVectorImpl &Splits, CallingConv::ID CallConv, ArrayRef Ins, @@ -1041,29 +1060,131 @@ } // Allocate special inputs passed in VGPRs. -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { if (Info.hasWorkItemIDX()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + unsigned Reg = AMDGPU::VGPR0; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + assert(Reg == AMDGPU::VGPR0); + CCInfo.AllocateReg(Reg); + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + unsigned Reg = AMDGPU::VGPR1; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + + assert(Reg == AMDGPU::VGPR1); CCInfo.AllocateReg(Reg); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + unsigned Reg = AMDGPU::VGPR2; MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + + assert(Reg == AMDGPU::VGPR2); CCInfo.AllocateReg(Reg); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); } } +// Try to allocate a VGPR at the end of the argument list, or if no argument +// VGPRs are left allocating a stack slot. +static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) { + ArrayRef ArgVGPRs + = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); + if (RegIdx == ArgVGPRs.size()) { + // Spill to stack required. 
+ int64_t Offset = CCInfo.AllocateStack(4, 4); + + return ArgDescriptor::createStack(Offset); + } + + unsigned Reg = ArgVGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, + const TargetRegisterClass *RC, + unsigned NumArgRegs) { + ArrayRef ArgSGPRs = makeArrayRef(RC->begin(), 32); + unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs); + if (RegIdx == ArgSGPRs.size()) + report_fatal_error("ran out of SGPRs for arguments"); + + unsigned Reg = ArgSGPRs[RegIdx]; + Reg = CCInfo.AllocateReg(Reg); + assert(Reg != AMDGPU::NoRegister); + + MachineFunction &MF = CCInfo.getMachineFunction(); + MF.addLiveIn(Reg, RC); + return ArgDescriptor::createRegister(Reg); +} + +static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32); +} + +static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { + return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); +} + +static void allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + if (Info.hasWorkItemIDX()) + Info.setWorkItemIDX(allocateVGPR32Input(CCInfo)); + + if (Info.hasWorkItemIDY()) + Info.setWorkItemIDY(allocateVGPR32Input(CCInfo)); + + if (Info.hasWorkItemIDZ()) + Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo)); +} + +static void allocateSpecialInputSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) { + auto &ArgInfo = Info.getArgInfo(); + + // TODO: Unify handling with private memory pointers. + + if (Info.hasDispatchPtr()) + ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasQueuePtr()) + ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); + + if (Info.hasKernargSegmentPtr()) + ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + + if (Info.hasDispatchID()) + ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); + + // flat_scratch_init is not applicable for non-kernel functions. + + if (Info.hasWorkGroupIDX()) + ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDY()) + ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo); + + if (Info.hasWorkGroupIDZ()) + ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); +} + // Allocate special inputs passed in user SGPRs. static void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, @@ -1198,8 +1319,8 @@ // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. 
- unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + unsigned PrivateSegmentBufferReg = Info.getPreloadedReg( + MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); if (MFI.hasCalls()) { @@ -1215,8 +1336,8 @@ = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); Info.setScratchWaveOffsetReg(ReservedOffsetReg); } else { - unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg( + MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); } } else { @@ -1241,9 +1362,9 @@ // offset is still in an input SGPR. Info.setScratchRSrcReg(ReservedBufferReg); - if (HasStackObjects && !MFI.hasCalls()) { - unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue( - MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + if (HasStackObjects && !MFI.hasCalls()) { + unsigned ScratchWaveOffsetReg = Info.getPreloadedReg( + MF, AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg); } else { unsigned ReservedOffsetReg @@ -1376,7 +1497,7 @@ } if (IsEntryFunc) { - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); } @@ -1468,6 +1589,11 @@ InVals.push_back(Val); } + if (!IsEntryFunc) { + // Special inputs come after user arguments. + allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); + } + // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); @@ -1475,8 +1601,13 @@ CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis(); + ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo()); + return Chains.empty() ? Chain : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } @@ -1701,6 +1832,78 @@ return Chain; } +// Add code to pass special inputs required depending on used features separate +// from the explicit user arguments present in the IR. +void SITargetLowering::passSpecialInputs( + CallLoweringInfo &CLI, + const SIMachineFunctionInfo &Info, + SmallVectorImpl> &RegsToPass, + SmallVectorImpl &MemOpChains, + SDValue Chain, + SDValue StackPtr) const { + const Function *CalleeFunc = CLI.CS->getCalledFunction(); + if (!CalleeFunc) + report_fatal_error("indirect calls not handled"); + + SelectionDAG &DAG = CLI.DAG; + MachineFunction &MF = DAG.getMachineFunction(); + const SDLoc &DL = CLI.DL; + + const SISubtarget *ST = getSubtarget(); + const SIRegisterInfo *TRI = ST->getRegisterInfo(); + + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis(); + const AMDGPUFunctionArgInfo &CalleeArgInfo + = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + + const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + + // TODO: Unify with private memory register handling. This is complicated by + // the fact that at least in kernels, the input argument is not necessarily + // in the same location as the input. 
+ AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { + AMDGPUFunctionArgInfo::DISPATCH_PTR, + AMDGPUFunctionArgInfo::QUEUE_PTR, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::DISPATCH_ID, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::WORKITEM_ID_X, + AMDGPUFunctionArgInfo::WORKITEM_ID_Y, + AMDGPUFunctionArgInfo::WORKITEM_ID_Z + }; + + for (auto InputID : InputRegs) { + const ArgDescriptor *OutgoingArg; + const TargetRegisterClass *ArgRC; + + std::tie(OutgoingArg, ArgRC) + = CalleeArgInfo.getPreloadedValue(MF, InputID); + if (!OutgoingArg) + continue; + + const ArgDescriptor *IncomingArg; + const TargetRegisterClass *IncomingArgRC; + std::tie(IncomingArg, IncomingArgRC) + = CallerArgInfo.getPreloadedValue(MF, InputID); + assert(IncomingArgRC == ArgRC); + + // All special arguments are ints for now. + EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; + SDValue InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg); + if (OutgoingArg->isRegister()) { + RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + } else { + SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr, + InputReg, + OutgoingArg->getStackOffset()); + MemOpChains.push_back(ArgStore); + } + } +} + // The wave scratch offset register is used as the global base pointer. SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { @@ -1889,6 +2092,9 @@ } } + // Copy special input registers after user input arguments. + passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr); + if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -3414,7 +3620,6 @@ SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); auto MFI = MF.getInfo(); - const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); EVT VT = Op.getValueType(); SDLoc DL(Op); @@ -3426,10 +3631,8 @@ case Intrinsic::amdgcn_implicit_buffer_ptr: { if (getSubtarget()->isAmdCodeObjectV2(MF)) return emitNonHSAIntrinsicError(DAG, DL, VT); - - unsigned Reg = TRI->getPreloadedValue(MF, - SIRegisterInfo::IMPLICIT_BUFFER_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); } case Intrinsic::amdgcn_dispatch_ptr: case Intrinsic::amdgcn_queue_ptr: { @@ -3441,23 +3644,20 @@ return DAG.getUNDEF(VT); } - auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? - SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR; - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, - TRI->getPreloadedValue(MF, Reg), VT); + auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ? 
+ AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR; + return getPreloadedValue(DAG, *MFI, VT, RegID); } case Intrinsic::amdgcn_implicitarg_ptr: { unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT); return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset); } case Intrinsic::amdgcn_kernarg_segment_ptr: { - unsigned Reg - = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } case Intrinsic::amdgcn_dispatch_id: { - unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_ID); - return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT); + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID); } case Intrinsic::amdgcn_rcp: return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); @@ -3542,28 +3742,32 @@ SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegister(DAG, &AMDGPU::SReg_32_XM0RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); - case Intrinsic::amdgcn_workitem_id_x: + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_workitem_id_x: { case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDX); + } case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDY); case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); + return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, + SDLoc(DAG.getEntryNode()), + MFI->getArgInfo().WorkItemIDZ); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -904,7 +904,6 @@ MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); const SISubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); DebugLoc DL = MBB.findDebugLoc(MI); unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); @@ -924,13 +923,13 @@ WorkGroupSize > 
WavefrontSize) { unsigned TIDIGXReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + = MFI->getPreloadedReg(*MF, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); unsigned TIDIGYReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + = MFI->getPreloadedReg(*MF, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); unsigned TIDIGZReg - = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); + = MFI->getPreloadedReg(*MF, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); + MFI->getPreloadedReg(*MF, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "AMDGPUMachineFunction.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "AMDGPUArgumentUsageInfo.h" #include "SIRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCRegisterInfo.h" @@ -96,33 +97,7 @@ // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. unsigned StackPtrOffsetReg; - // Input registers for non-HSA ABI - unsigned ImplicitBufferPtrUserSGPR; - - // Input registers setup for the HSA ABI. - // User SGPRs in allocation order. - unsigned PrivateSegmentBufferUserSGPR; - unsigned DispatchPtrUserSGPR; - unsigned QueuePtrUserSGPR; - unsigned KernargSegmentPtrUserSGPR; - unsigned DispatchIDUserSGPR; - unsigned FlatScratchInitUserSGPR; - unsigned PrivateSegmentSizeUserSGPR; - unsigned GridWorkGroupCountXUserSGPR; - unsigned GridWorkGroupCountYUserSGPR; - unsigned GridWorkGroupCountZUserSGPR; - - // System SGPRs in allocation order. - unsigned WorkGroupIDXSystemSGPR; - unsigned WorkGroupIDYSystemSGPR; - unsigned WorkGroupIDZSystemSGPR; - unsigned WorkGroupInfoSystemSGPR; - unsigned PrivateSegmentWaveByteOffsetSystemSGPR; - - // VGPR inputs. These are always v0, v1 and v2 for entry functions. - unsigned WorkItemIDXVGPR; - unsigned WorkItemIDYVGPR; - unsigned WorkItemIDZVGPR; + AMDGPUFunctionArgInfo ArgInfo; // Graphics info. unsigned PSInputAddr; @@ -218,7 +193,6 @@ SmallVector SpillVGPRs; public: - SIMachineFunctionInfo(const MachineFunction &MF); ArrayRef getSGPRToVGPRSpills(int FrameIndex) const { @@ -245,37 +219,52 @@ // Add system SGPRs. 
unsigned addWorkGroupIDX() { - WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); } unsigned addWorkGroupIDY() { - WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); } unsigned addWorkGroupIDZ() { - WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } unsigned addWorkGroupInfo() { - WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return WorkGroupInfoSystemSGPR; + return ArgInfo.WorkGroupInfo.getRegister(); + } + + // Add special VGPR inputs + void setWorkItemIDX(ArgDescriptor Arg) { + ArgInfo.WorkItemIDX = Arg; + } + + void setWorkItemIDY(ArgDescriptor Arg) { + ArgInfo.WorkItemIDY = Arg; + } + + void setWorkItemIDZ(ArgDescriptor Arg) { + ArgInfo.WorkItemIDZ = Arg; } + unsigned addPrivateSegmentWaveByteOffset() { - PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } void setPrivateSegmentWaveByteOffset(unsigned Reg) { - PrivateSegmentWaveByteOffsetSystemSGPR = Reg; + ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } bool hasPrivateSegmentBuffer() const { @@ -350,6 +339,25 @@ return ImplicitBufferPtr; } + AMDGPUFunctionArgInfo &getArgInfo() { + return ArgInfo; + } + + const AMDGPUFunctionArgInfo &getArgInfo() const { + return ArgInfo; + } + + std::pair + getPreloadedValue(const MachineFunction &MF, + AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return ArgInfo.getPreloadedValue(MF, Value); + } + + unsigned getPreloadedReg(const MachineFunction &MF, + AMDGPUFunctionArgInfo::PreloadedValue Value) const { + return ArgInfo.getPreloadedValue(MF, Value).first->getRegister(); + } + unsigned getNumUserSGPRs() const { return NumUserSGPRs; } @@ -359,7 +367,7 @@ } unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { - return PrivateSegmentWaveByteOffsetSystemSGPR; + return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } /// \brief Returns the physical register reserved for use as the resource @@ -401,11 +409,11 @@ } unsigned getQueuePtrUserSGPR() const { - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned getImplicitBufferPtrUserSGPR() const { - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); } bool hasSpilledSGPRs() const { @@ -537,13 +545,13 @@ switch (Dim) { case 0: assert(hasWorkGroupIDX()); - return WorkGroupIDXSystemSGPR; + return ArgInfo.WorkGroupIDX.getRegister(); case 1: assert(hasWorkGroupIDY()); - return WorkGroupIDYSystemSGPR; + return ArgInfo.WorkGroupIDY.getRegister(); case 2: assert(hasWorkGroupIDZ()); - return WorkGroupIDZSystemSGPR; + return ArgInfo.WorkGroupIDZ.getRegister(); } llvm_unreachable("unexpected dimension"); } Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== 
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -27,24 +27,7 @@ ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), FrameOffsetReg(AMDGPU::FP_REG), StackPtrOffsetReg(AMDGPU::SP_REG), - PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), - DispatchPtrUserSGPR(AMDGPU::NoRegister), - QueuePtrUserSGPR(AMDGPU::NoRegister), - KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), - DispatchIDUserSGPR(AMDGPU::NoRegister), - FlatScratchInitUserSGPR(AMDGPU::NoRegister), - PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), - GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), - WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), - WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), - WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), - PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), - WorkItemIDXVGPR(AMDGPU::NoRegister), - WorkItemIDYVGPR(AMDGPU::NoRegister), - WorkItemIDZVGPR(AMDGPU::NoRegister), + ArgInfo(), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -91,8 +74,10 @@ FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - // FIXME: Not really a system SGPR. - PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(ScratchRSrcReg); + ArgInfo.PrivateSegmentWaveByteOffset = + ArgDescriptor::createRegister(ScratchWaveOffsetReg); } CallingConv::ID CC = F->getCallingConv(); @@ -145,10 +130,11 @@ if (HasStackObjects || MaySpill) { PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. 
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + ArgInfo.PrivateSegmentWaveByteOffset + = ArgDescriptor::createRegister(AMDGPU::SGPR5); } } @@ -182,52 +168,54 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { - PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + ArgInfo.PrivateSegmentBuffer = + ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass)); NumUserSGPRs += 4; - return PrivateSegmentBufferUserSGPR; + return ArgInfo.PrivateSegmentBuffer.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { - DispatchPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchPtrUserSGPR; + return ArgInfo.DispatchPtr.getRegister(); } unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { - QueuePtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return QueuePtrUserSGPR; + return ArgInfo.QueuePtr.getRegister(); } unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { - KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.KernargSegmentPtr + = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return KernargSegmentPtrUserSGPR; + return ArgInfo.KernargSegmentPtr.getRegister(); } unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { - DispatchIDUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return DispatchIDUserSGPR; + return ArgInfo.DispatchID.getRegister(); } unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { - FlatScratchInitUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return FlatScratchInitUserSGPR; + return ArgInfo.FlatScratchInit.getRegister(); } unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { - ImplicitBufferPtrUserSGPR = TRI.getMatchingSuperReg( - getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; - return ImplicitBufferPtrUserSGPR; + return ArgInfo.ImplicitBufferPtr.getRegister(); } /// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI. 
Index: lib/Target/AMDGPU/SIRegisterInfo.h =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.h +++ lib/Target/AMDGPU/SIRegisterInfo.h @@ -186,31 +186,6 @@ OpType <= AMDGPU::OPERAND_SRC_LAST; } - enum PreloadedValue { - // SGPRS: - PRIVATE_SEGMENT_BUFFER = 0, - DISPATCH_PTR = 1, - QUEUE_PTR = 2, - KERNARG_SEGMENT_PTR = 3, - DISPATCH_ID = 4, - FLAT_SCRATCH_INIT = 5, - WORKGROUP_ID_X = 10, - WORKGROUP_ID_Y = 11, - WORKGROUP_ID_Z = 12, - PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, - IMPLICIT_BUFFER_PTR = 15, - - // VGPRS: - FIRST_VGPR_VALUE = 16, - WORKITEM_ID_X = FIRST_VGPR_VALUE, - WORKITEM_ID_Y = 17, - WORKITEM_ID_Z = 18 - }; - - /// \brief Returns the physical register that \p Value is stored in. - unsigned getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const; - unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF) const; Index: lib/Target/AMDGPU/SIRegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIRegisterInfo.cpp +++ lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1341,61 +1341,6 @@ return getCommonSubClass(DefRC, SrcRC) != nullptr; } -// FIXME: Most of these are flexible with HSA and we don't need to reserve them -// as input registers if unused. Whether the dispatch ptr is necessary should be -// easy to detect from used intrinsics. Scratch setup is harder to know. -unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, - enum PreloadedValue Value) const { - - const SIMachineFunctionInfo *MFI = MF.getInfo(); - const SISubtarget &ST = MF.getSubtarget(); - (void)ST; - switch (Value) { - case SIRegisterInfo::WORKGROUP_ID_X: - assert(MFI->hasWorkGroupIDX()); - return MFI->WorkGroupIDXSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Y: - assert(MFI->hasWorkGroupIDY()); - return MFI->WorkGroupIDYSystemSGPR; - case SIRegisterInfo::WORKGROUP_ID_Z: - assert(MFI->hasWorkGroupIDZ()); - return MFI->WorkGroupIDZSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: - return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; - case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: - assert(MFI->hasPrivateSegmentBuffer()); - return MFI->PrivateSegmentBufferUserSGPR; - case SIRegisterInfo::IMPLICIT_BUFFER_PTR: - assert(MFI->hasImplicitBufferPtr()); - return MFI->ImplicitBufferPtrUserSGPR; - case SIRegisterInfo::KERNARG_SEGMENT_PTR: - assert(MFI->hasKernargSegmentPtr()); - return MFI->KernargSegmentPtrUserSGPR; - case SIRegisterInfo::DISPATCH_ID: - assert(MFI->hasDispatchID()); - return MFI->DispatchIDUserSGPR; - case SIRegisterInfo::FLAT_SCRATCH_INIT: - assert(MFI->hasFlatScratchInit()); - return MFI->FlatScratchInitUserSGPR; - case SIRegisterInfo::DISPATCH_PTR: - assert(MFI->hasDispatchPtr()); - return MFI->DispatchPtrUserSGPR; - case SIRegisterInfo::QUEUE_PTR: - assert(MFI->hasQueuePtr()); - return MFI->QueuePtrUserSGPR; - case SIRegisterInfo::WORKITEM_ID_X: - assert(MFI->hasWorkItemIDX()); - return AMDGPU::VGPR0; - case SIRegisterInfo::WORKITEM_ID_Y: - assert(MFI->hasWorkItemIDY()); - return AMDGPU::VGPR1; - case SIRegisterInfo::WORKITEM_ID_Z: - assert(MFI->hasWorkItemIDZ()); - return AMDGPU::VGPR2; - } - llvm_unreachable("unexpected preloaded value type"); -} - /// \brief Returns a register that is not used at any point in the function. /// If all registers are used, then this function will return // AMDGPU::NoRegister. 
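The source changes above replace SIRegisterInfo::getPreloadedValue's hard-coded enum/switch with a per-function AMDGPUFunctionArgInfo whose fields are ArgDescriptors, so each special input (dispatch pointer, queue pointer, workgroup/workitem IDs, etc.) can be described either as a physical register or, for non-kernel callees that run out of argument registers, as a stack offset. AMDGPUArgumentUsageInfo.h itself is not part of this diff; the following is only a minimal, self-contained sketch of the ArgDescriptor interface the patch relies on (createRegister/createStack, isRegister, getRegister, getStackOffset, and a boolean "was this input allocated" check). The method names come from the patch; the storage layout is an assumption for illustration.

// Sketch only: not the actual AMDGPUArgumentUsageInfo.h definition.
#include <cassert>

struct ArgDescriptor {
  ArgDescriptor() = default;

  static ArgDescriptor createRegister(unsigned Reg) {
    return ArgDescriptor(Reg, /*IsStack=*/false);
  }

  static ArgDescriptor createStack(unsigned Offset) {
    return ArgDescriptor(Offset, /*IsStack=*/true);
  }

  // True if this input was allocated at all (either form).
  explicit operator bool() const { return IsSet; }

  bool isRegister() const { return !IsStack; }

  unsigned getRegister() const {
    assert(IsSet && !IsStack);
    return RegOrOffset;
  }

  unsigned getStackOffset() const {
    assert(IsSet && IsStack);
    return RegOrOffset;
  }

private:
  ArgDescriptor(unsigned RegOrOffset, bool IsStack)
      : RegOrOffset(RegOrOffset), IsSet(true), IsStack(IsStack) {}

  unsigned RegOrOffset = 0; // Physical register number or byte offset.
  bool IsSet = false;       // Distinguishes "not allocated" from register 0.
  bool IsStack = false;     // Stack slot vs. register.
};

With a shape like this, the kernel entry path can record ArgDescriptor::createRegister(AMDGPU::VGPR0) for workitem ID X, while allocateVGPR32Input/allocateSGPR32Input above fall back to createStack (or report_fatal_error for SGPRs) when callee argument registers are exhausted, and loadInputValue/passSpecialInputs pick the correct form at the use and call sites. The tests below exercise both paths.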
Index: test/CodeGen/AMDGPU/callee-special-input-sgprs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -0,0 +1,529 @@ +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +; GCN-LABEL: {{^}}use_dispatch_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_dispatch_ptr() #1 { + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_dispatch_ptr: +; GCN: s_mov_b64 s[6:7], s[4:5] +define amdgpu_kernel void @kern_indirect_use_dispatch_ptr(i32) #1 { + call void @use_dispatch_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_queue_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_queue_ptr() #1 { + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr: +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { + call void @use_queue_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_queue_ptr_addrspacecast: +; CIVI: s_load_dword [[APERTURE_LOAD:s[0-9]+]], s[6:7], 0x10 +; GFX9: s_getreg_b32 [[APERTURE_LOAD:s[0-9]+]] + +; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]] +; GCN: flat_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}} +define void @use_queue_ptr_addrspacecast() #1 { + %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)* + store volatile i32 0, i32 addrspace(4)* %asc + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr_addrspacecast: +; CIVI: s_mov_b64 s[6:7], s[4:5] +; GFX9-NOT: s_mov_b64 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_queue_ptr_addrspacecast(i32) #1 { + call void @use_queue_ptr_addrspacecast() + ret void +} + +; GCN-LABEL: {{^}}use_kernarg_segment_ptr: +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +define void @use_kernarg_segment_ptr() #1 { + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %value = load volatile i32, i32 addrspace(2)* %header_ptr + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_kernarg_segment_ptr: +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_kernarg_segment_ptr(i32) #1 { + call void @use_kernarg_segment_ptr() + ret void +} + +; GCN-LABEL: {{^}}use_dispatch_id: +; GCN: ; use s[6:7] +define void @use_dispatch_id() #1 { + %id = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %id) + ret void +} + +; No kernarg segment so that there is a mov to check. With kernarg +; pointer enabled, it happens to end up in the right place anyway. 
+ +; GCN-LABEL: {{^}}kern_indirect_use_dispatch_id: +; GCN: s_mov_b64 s[6:7], s[4:5] +define amdgpu_kernel void @kern_indirect_use_dispatch_id() #1 { + call void @use_dispatch_id() + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_x: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_x() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_stack_workgroup_id_x: +; GCN: s_waitcnt +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4 +; GCN: ; use s6 +; GCN: s_setpc_b64 +define void @use_stack_workgroup_id_x() #1 { + %alloca = alloca i32 + store volatile i32 0, i32* %alloca + %val = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_y: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_y() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_z: +; GCN: s_waitcnt +; GCN: ; use s6 +define void @use_workgroup_id_z() #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xy: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xyz: +; GCN: ; use s6 +; GCN: ; use s7 +; GCN: ; use s8 +define void @use_workgroup_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.y() + %val2 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + call void asm sideeffect "; use $0", "s"(i32 %val2) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_xz: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.x() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}use_workgroup_id_yz: +; GCN: ; use s6 +; GCN: ; use s7 +define void @use_workgroup_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workgroup.id.y() + %val1 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val0) + call void asm sideeffect "; use $0", "s"(i32 %val1) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: +; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y: +; GCN: s_mov_b32 s33, s6 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z: +; GCN: s_mov_b32 s33, s6 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s5 +; GCN: s_swappc_b64 +define amdgpu_kernel void 
@kern_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy: +; GCN: s_mov_b32 s33, s6 +; GCN: s_mov_b32 s6, s4 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s7, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { + call void @use_workgroup_id_xy() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz: +; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s8, s6 +; GCN: s_mov_b32 s6, s4 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s7, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { + call void @use_workgroup_id_xyz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz: +; GCN: s_mov_b32 s33, s6 +; GCN: s_mov_b32 s6, s4 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s7, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { + call void @use_workgroup_id_xz() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz: +; GCN: s_mov_b32 s33, s7 +; GCN: s_mov_b32 s7, s6 +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s6, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { + call void @use_workgroup_id_yz() + ret void +} + +; Argument is in right place already +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_x() #1 { + call void @use_workgroup_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_y() #1 { + call void @use_workgroup_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: +; GCN-NOT: s6 +define void @func_indirect_use_workgroup_id_z() #1 { + call void @use_workgroup_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x: +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_x(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y: +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z: +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN: ; use s6 +define void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + call void asm sideeffect "; use $0", "s"(i32 %val) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_x: +; GCN-DAG: s_mov_b32 s33, s5 +; GCN-DAG: s_mov_b32 s5, s4 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s6, s5 +; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { + call void @other_arg_use_workgroup_id_x(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y: +; GCN-DAG: s_mov_b32 s33, s6 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN: 
s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s6, s5 +; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { + call void @other_arg_use_workgroup_id_y(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z: +; GCN: s_mov_b32 s33, s6 +; GCN-DAG: v_mov_b32_e32 v0, 0x22b +; GCN: s_mov_b32 s4, s33 +; GCN-DAG: s_mov_b32 s6, s5 + +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { + call void @other_arg_use_workgroup_id_z(i32 555) + ret void +} + +; GCN-LABEL: {{^}}use_every_sgpr_input: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0 +; GCN: s_load_dword s{{[0-9]+}}, s[10:11], 0x0 +; GCN: ; use s[12:13] +; GCN: ; use s14 +; GCN: ; use s15 +; GCN: ; use s16 +define void @use_every_sgpr_input() #1 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input: +; GCN: s_mov_b32 s33, s15 +; GCN: s_mov_b32 s16, s14 +; GCN: s_mov_b32 s15, s13 +; GCN: s_mov_b32 s14, s12 +; GCN: s_mov_b64 s[12:13], s[10:11] +; GCN: s_mov_b64 s[10:11], s[8:9] +; GCN: s_mov_b64 s[8:9], s[6:7] +; GCN: s_mov_b64 s[6:7], s[4:5] +; GCN: s_mov_b32 s4, s33 +; GCN: s_mov_b32 s32, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input: +; GCN-NOT: s6 +; GCN-NOT: s7 +; GCN-NOT: s8 +; GCN-NOT: s9 +; GCN-NOT: s10 +; GCN-NOT: s11 +; GCN-NOT: s12 +; GCN-NOT: s13 +; GCN-NOT: s[6:7] +; GCN-NOT: s[8:9] +; GCN-NOT: s[10:11] +; GCN-NOT: s[12:13] +define void @func_indirect_use_every_sgpr_input() #1 { + call void @use_every_sgpr_input() + ret void +} + +; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz: +; GCN-DAG: s_mov_b32 s6, s14 +; GCN-DAG: s_mov_b32 s7, s15 +; GCN-DAG: s_mov_b32 s8, s16 +; GCN: s_swappc_b64 +define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { + %alloca = alloca i32, align 4 + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 
addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + call void @use_workgroup_id_xyz() + ret void +} + +; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: +; GCN: s_mov_b32 s5, s32 +; GCN: s_add_u32 s32, s32, 0x200 + +; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14 +; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15 +; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16 +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7] +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9] +; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11] + +; GCN-DAG: s_mov_b32 s6, [[SAVE_X]] +; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]] +; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]] +; GCN: s_swappc_b64 + +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4 +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: s_load_dword s{{[0-9]+}}, +; GCN: ; use +; GCN: ; use [[SAVE_X]] +; GCN: ; use [[SAVE_Y]] +; GCN: ; use [[SAVE_Z]] +define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 { + %alloca = alloca i32, align 4 + call void @use_workgroup_id_xyz() + + store volatile i32 0, i32* %alloca + + %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* + %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc + + %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 + %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* + %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc + + %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 + %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)* + %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc + + %val3 = call i64 @llvm.amdgcn.dispatch.id() + call void asm sideeffect "; use $0", "s"(i64 %val3) + + %val4 = call i32 @llvm.amdgcn.workgroup.id.x() + call void asm sideeffect "; use $0", "s"(i32 %val4) + + %val5 = call i32 @llvm.amdgcn.workgroup.id.y() + call void asm sideeffect "; use $0", "s"(i32 %val5) + + %val6 = call i32 @llvm.amdgcn.workgroup.id.z() + call void asm sideeffect "; use $0", "s"(i32 %val6) + + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() #0 +declare i32 @llvm.amdgcn.workgroup.id.y() #0 +declare i32 @llvm.amdgcn.workgroup.id.z() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0 +declare i64 @llvm.amdgcn.dispatch.id() #0 +declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 + +attributes #0 = { nounwind readnone 
speculatable } +attributes #1 = { nounwind noinline } Index: test/CodeGen/AMDGPU/callee-special-input-vgprs.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -0,0 +1,651 @@ +; RUN: llc -amdgpu-function-calls -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s + +; GCN-LABEL: {{^}}use_workitem_id_x: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_x() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_y: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_y() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_z: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_z() #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xy: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xy() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xyz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xyz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + store volatile i32 %val2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_xz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_xz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}use_workitem_id_yz: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @use_workitem_id_yz() #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.y() + %val1 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val0, i32 addrspace(1)* undef + store volatile i32 %val1, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void 
@kern_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y: +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: v_mov_b32_e32 v0, v1 +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z: +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: v_mov_b32_e32 v0, v2 +; GCN-NOT: v0 +; GCN-NOT: v2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_x() #1 { + call void @use_workitem_id_x() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_y: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_y() #1 { + call void @use_workitem_id_y() + ret void +} + +; GCN-LABEL: {{^}}func_indirect_use_workitem_id_z: +; GCN-NOT: v0 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define void @func_indirect_use_workitem_id_z() #1 { + call void @use_workitem_id_z() + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_x: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_x(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_y: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_y(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}other_arg_use_workitem_id_z: +; GCN: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 +define void @other_arg_use_workitem_id_z(i32 %arg0) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x: +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 { + call void @other_arg_use_workitem_id_x(i32 555) + ret void +} + + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y: +; GCN-NOT: v1 +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN-NOT: v1 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 { + call void @other_arg_use_workitem_id_y(i32 555) + ret void +} + +; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z: +; GCN: v_mov_b32_e32 v0, 0x22b +; GCN: v_mov_b32_e32 v1, v2 +; GCN: s_swappc_b64 +; GCN-NOT: v0 +define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { + call void @other_arg_use_workitem_id_z(i32 555) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword 
v32, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: +; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 s32, s33 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN: s_mov_b32 s4, s33 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: +; GCN-NOT: s32 +; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN: s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { + store volatile i32 %arg0, i32 addrspace(1)* undef + call void @too_many_args_use_workitem_id_x( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, 
i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; Requires loading and storing to stack slot. +; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 +; GCN: s_add_u32 s32, s32, 0x300{{$}} + +; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}} + +; GCN: s_swappc_b64 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload +; GCN: s_sub_u32 s32, s32, 0x300{{$}} +; GCN: s_setpc_b64 +define void @too_many_args_call_too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + call void @too_many_args_use_workitem_id_x( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) + ret void +} + +; stack layout: +; frame[0] = emergency stack slot +; frame[1] = byval arg32 +; frame[2] = stack passed workitem ID x +; frame[3] = VGPR spill slot + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 +; GCN: buffer_load_dword v0, off, s[0:3], s5 offset:4 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload +; GCN: s_setpc_b64 +define void @too_many_args_use_workitem_id_x_byval( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 { + %val = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store 
volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + %private = load volatile i32, i32* %arg32 + ret void +} + +; frame[0] = emergency stack slot +; frame[1] = + +; sp[0] = callee emergency stack slot reservation +; sp[1] = byval +; sp[2] = ?? +; sp[3] = stack passed workitem ID x + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: +; GCN: s_mov_b32 s33, s7 +; GCN: s_add_u32 s32, s33, 0x200{{$}} + +; GCN-DAG: s_add_u32 s32, s32, 0x100{{$}} +; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 + +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4 + store volatile i32 999, i32* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32* %alloca) + ret void +} + +; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12 + +; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} +; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]], +; GCN: s_swappc_b64 +define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { + %alloca = alloca i32, align 4 + store volatile i32 999, i32* %alloca + call void @too_many_args_use_workitem_id_x_byval( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320, + i32* %alloca) + ret void +} + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: +; GCN: s_mov_b32 s5, s32 +; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8{{$}} +; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+]}}, v32 +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:12{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 + +; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @too_many_args_use_workitem_id_xyz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + store volatile i32 %arg31, i32 addrspace(1)* undef + + ret void +} + +; frame[0] = kernel emergency stack slot +; frame[1] = callee emergency stack slot +; frame[2] = ID X +; frame[3] = ID Y +; frame[4] = ID Z + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: +; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 s32, s33 + +; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { + call void @too_many_args_use_workitem_id_xyz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, 
i32 270, i32 280, + i32 290, i32 300, i32 310, i32 320) + ret void +} + +; workitem ID X in register, yz on stack +; v31 = workitem ID X +; frame[0] = emergency slot +; frame[1] = workitem Y +; frame[2] = workitem Z + +; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz: +; GCN: s_mov_b32 s5, s32 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 +; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:4{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 +; GCN: buffer_load_dword v31, off, s[0:3], s5 offset:8{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31 + +; GCN: s_waitcnt +; GCN-NEXT: s_setpc_b64 +; GCN: ScratchSize: 12 +define void @too_many_args_use_workitem_id_x_stack_yz( + i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, + i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, + i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23, + i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30) #1 { + %val0 = call i32 @llvm.amdgcn.workitem.id.x() + store volatile i32 %val0, i32 addrspace(1)* undef + %val1 = call i32 @llvm.amdgcn.workitem.id.y() + store volatile i32 %val1, i32 addrspace(1)* undef + %val2 = call i32 @llvm.amdgcn.workitem.id.z() + store volatile i32 %val2, i32 addrspace(1)* undef + + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + store volatile i32 %arg3, i32 addrspace(1)* undef + store volatile i32 %arg4, i32 addrspace(1)* undef + store volatile i32 %arg5, i32 addrspace(1)* undef + store volatile i32 %arg6, i32 addrspace(1)* undef + store volatile i32 %arg7, i32 addrspace(1)* undef + + store volatile i32 %arg8, i32 addrspace(1)* undef + store volatile i32 %arg9, i32 addrspace(1)* undef + store volatile i32 %arg10, i32 addrspace(1)* undef + store volatile i32 %arg11, i32 addrspace(1)* undef + store volatile i32 %arg12, i32 addrspace(1)* undef + store volatile i32 %arg13, i32 addrspace(1)* undef + store volatile i32 %arg14, i32 addrspace(1)* undef + store volatile i32 %arg15, i32 addrspace(1)* undef + + store volatile i32 %arg16, i32 addrspace(1)* undef + store volatile i32 %arg17, i32 addrspace(1)* undef + store volatile i32 %arg18, i32 addrspace(1)* undef + store volatile i32 %arg19, i32 addrspace(1)* undef + store volatile i32 %arg20, i32 addrspace(1)* undef + store volatile i32 %arg21, i32 addrspace(1)* undef + store volatile i32 %arg22, i32 addrspace(1)* undef + store volatile i32 %arg23, i32 addrspace(1)* undef + + store volatile i32 %arg24, i32 addrspace(1)* undef + store volatile i32 %arg25, i32 addrspace(1)* undef + store volatile i32 %arg26, i32 addrspace(1)* undef + store volatile i32 %arg27, i32 addrspace(1)* undef + store volatile i32 %arg28, i32 addrspace(1)* undef + store volatile i32 %arg29, i32 addrspace(1)* undef + store volatile i32 %arg30, i32 addrspace(1)* undef + + ret void +} + +; frame[0] = kernel emergency stack slot +; frame[1] = callee emergency stack slot +; frame[2] = ID Y +; frame[3] = ID Z + +; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: +; GCN: s_mov_b32 s33, s5 +; GCN: s_mov_b32 s32, s33 + +; GCN-DAG: v_mov_b32_e32 v31, v0 +; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8 +; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12 +; GCN: s_swappc_b64 +define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 
{ + call void @too_many_args_use_workitem_id_x_stack_yz( + i32 10, i32 20, i32 30, i32 40, + i32 50, i32 60, i32 70, i32 80, + i32 90, i32 100, i32 110, i32 120, + i32 130, i32 140, i32 150, i32 160, + i32 170, i32 180, i32 190, i32 200, + i32 210, i32 220, i32 230, i32 240, + i32 250, i32 260, i32 270, i32 280, + i32 290, i32 300, i32 310) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 +declare i32 @llvm.amdgcn.workitem.id.y() #0 +declare i32 @llvm.amdgcn.workitem.id.z() #0 + +attributes #0 = { nounwind readnone speculatable } +attributes #1 = { nounwind noinline }
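For readers skimming the checks, here is a minimal standalone sketch of the convention the workitem-ID tests above exercise. It is illustrative only, not part of the patch; the @sketch_* function names are invented for this example, and it assumes the same -amdgpu-function-calls ABI used by the RUN line above. A callee that reads workitem ID Y expects it in v0, so a kernel caller (where ID Y arrives in v1 at entry) must emit a v_mov_b32_e32 v0, v1 copy before s_swappc_b64 -- exactly what the kern_indirect_use_workitem_id_y checks verify.

; Illustrative sketch (hypothetical names, not part of the patch).
declare i32 @llvm.amdgcn.workitem.id.y() #0

; Callee: reads workitem ID Y via the intrinsic; under the function-call ABI
; this input is expected in v0 on entry to the callee.
define void @sketch_use_workitem_id_y() #1 {
  %val = call i32 @llvm.amdgcn.workitem.id.y()
  store volatile i32 %val, i32 addrspace(1)* undef
  ret void
}

; Kernel caller: workitem ID Y is live in v1 at kernel entry, so the caller
; must copy it into v0 before the call (v_mov_b32_e32 v0, v1).
define amdgpu_kernel void @sketch_kern_indirect_use_workitem_id_y() #1 {
  call void @sketch_use_workitem_id_y()
  ret void
}

attributes #0 = { nounwind readnone speculatable }
attributes #1 = { nounwind noinline }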