Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -695,7 +695,7 @@
     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
-    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
   }
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
@@ -113,8 +114,6 @@
                           SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                                      const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeFormalArguments(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
   void AnalyzeReturn(CCState &State,
                      const SmallVectorImpl<ISD::OutputArg> &Outs) const;
 
@@ -160,6 +159,7 @@
   bool isCheapToSpeculateCttz() const override;
   bool isCheapToSpeculateCtlz() const override;
 
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -861,11 +861,6 @@
   }
 }
 
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
-                           const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
-}
-
 void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
                            const SmallVectorImpl<ISD::OutputArg> &Outs) const {
 
@@ -885,6 +880,24 @@
 // Target specific lowering
 //===---------------------------------------------------------------------===//
 
+/// Selects the correct CCAssignFn for a given CallingConvention value.
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                    bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+    return CC_AMDGPU;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
+}
+
 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                         SmallVectorImpl<SDValue> &InVals) const {
   SDValue Callee = CLI.Callee;
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1540,7 +1540,7 @@
   SmallVector<ISD::InputArg, 8> LocalIns;
 
   if (AMDGPU::isShader(CallConv)) {
-    AnalyzeFormalArguments(CCInfo, Ins);
+    CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
   } else {
     analyzeFormalArgumentsCompute(CCInfo, Ins);
   }
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -21,11 +21,13 @@
 namespace llvm {
 
 class SITargetLowering final : public AMDGPUTargetLowering {
-  SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
-                            unsigned Offset) const;
-  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
-                         SDValue Chain, unsigned Offset, bool Signed,
-                         const ISD::InputArg *Arg = nullptr) const;
+  SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
+                                   SDValue Chain, uint64_t Offset) const;
+  SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                   const SDLoc &SL, SDValue Chain,
+                                   uint64_t Offset, bool Signed,
+                                   const ISD::InputArg *Arg = nullptr) const;
+
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
   SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -55,6 +57,10 @@
                          const SDLoc &DL,
                          EVT VT) const;
 
+  SDValue convertArgType(
+    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+    bool Signed, const ISD::InputArg *Arg = nullptr) const;
+
   /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -840,13 +840,15 @@
   return TargetLowering::isTypeDesirableForOp(Op, VT);
 }
 
-SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
-                                            const SDLoc &SL, SDValue Chain,
-                                            unsigned Offset) const {
+SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   uint64_t Offset) const {
   const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF,
+                                                SIRegisterInfo::KERNARG_SEGMENT_PTR);
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
@@ -856,24 +858,10 @@
                      DAG.getConstant(Offset, SL, PtrVT));
 }
 
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
-                                         const SDLoc &SL, SDValue Chain,
-                                         unsigned Offset, bool Signed,
+SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         const SDLoc &SL, SDValue Val,
+                                         bool Signed,
                                          const ISD::InputArg *Arg) const {
-  const DataLayout &DL = DAG.getDataLayout();
-  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
-  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
-
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
-  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
-  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
-                             MachineMemOperand::MONonTemporal |
-                             MachineMemOperand::MODereferenceable |
-                             MachineMemOperand::MOInvariant);
-
-  SDValue Val = Load;
   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
       VT.bitsLT(MemVT)) {
     unsigned Opc = Arg->Flags.isZExt() ?
      ISD::AssertZext : ISD::AssertSext;
@@ -887,360 +875,434 @@
   else
     Val = DAG.getZExtOrTrunc(Val, SL, VT);
 
-  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+  return Val;
 }
 
-SDValue SITargetLowering::LowerFormalArguments(
-    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
-    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
-  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  FunctionType *FType = MF.getFunction()->getFunctionType();
-  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+SDValue SITargetLowering::lowerKernargMemParameter(
+    SelectionDAG &DAG, EVT VT, EVT MemVT,
+    const SDLoc &SL, SDValue Chain,
+    uint64_t Offset, bool Signed,
+    const ISD::InputArg *Arg) const {
+  const DataLayout &DL = DAG.getDataLayout();
+  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
-    const Function *Fn = MF.getFunction();
-    DiagnosticInfoUnsupported NoGraphicsHSA(
-        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
-    DAG.getContext()->diagnose(NoGraphicsHSA);
-    return DAG.getEntryNode();
-  }
+  unsigned Align = DL.getABITypeAlignment(Ty);
 
-  // Create stack objects that are used for emitting debugger prologue if
-  // "amdgpu-debugger-emit-prologue" attribute was specified.
-  if (ST.debuggerEmitPrologue())
-    createDebuggerPrologueStackObjects(MF);
+  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
+  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+                             MachineMemOperand::MONonTemporal |
+                             MachineMemOperand::MODereferenceable |
+                             MachineMemOperand::MOInvariant);
 
-  SmallVector<ISD::InputArg, 16> Splits;
-  BitVector Skipped(Ins.size());
+  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
+  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+}
 
-  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
-    const ISD::InputArg &Arg = Ins[i];
+static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+                                   CallingConv::ID CallConv,
+                                   ArrayRef<ISD::InputArg> Ins,
+                                   BitVector &Skipped,
+                                   FunctionType *FType,
+                                   SIMachineFunctionInfo *Info) {
+  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
+    const ISD::InputArg &Arg = Ins[I];
 
-    // First check if it's a PS input addr
+    // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
         !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) {
 
       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
-        // We can safely skip PS inputs
-        Skipped.set(i);
+        // We can safely skip PS inputs.
+        Skipped.set(I);
         ++PSInputNum;
         continue;
       }
 
       Info->markPSInputAllocated(PSInputNum);
       if (Arg.Used)
-        Info->PSInputEna |= 1 << PSInputNum;
+        Info->markPSInputEnabled(PSInputNum);
 
       ++PSInputNum;
     }
 
-    if (AMDGPU::isShader(CallConv)) {
-      // Second split vertices into their elements
-      if (Arg.VT.isVector()) {
-        ISD::InputArg NewArg = Arg;
-        NewArg.Flags.setSplit();
-        NewArg.VT = Arg.VT.getVectorElementType();
-
-        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
-        // three or five element vertex only needs three or five registers,
-        // NOT four or eight.
-        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-        unsigned NumElements = ParamType->getVectorNumElements();
-
-        for (unsigned j = 0; j != NumElements; ++j) {
-          Splits.push_back(NewArg);
-          NewArg.PartOffset += NewArg.VT.getStoreSize();
-        }
-      } else {
-        Splits.push_back(Arg);
+    // Second split vertices into their elements.
+    if (Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eight.
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned J = 0; J != NumElements; ++J) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
+    } else {
+      Splits.push_back(Arg);
     }
   }
+}
 
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                 *DAG.getContext());
+// Allocate special inputs passed in VGPRs.
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) {
+  if (Info.hasWorkItemIDX()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
 
-  // At least one interpolation mode must be enabled or else the GPU will hang.
-  //
-  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
-  // PSInputAddr, the user wants to enable some bits after the compilation
-  // based on run-time states. Since we can't know what the final PSInputEna
-  // will look like, so we shouldn't do anything here and the user should take
-  // responsibility for the correct programming.
-  //
-  // Otherwise, the following restrictions apply:
-  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
-  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
-  //   enabled too.
-  if (CallConv == CallingConv::AMDGPU_PS &&
-      ((Info->getPSInputAddr() & 0x7F) == 0 ||
-       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
-    CCInfo.AllocateReg(AMDGPU::VGPR0);
-    CCInfo.AllocateReg(AMDGPU::VGPR1);
-    Info->markPSInputAllocated(0);
-    Info->PSInputEna |= 1;
-  }
-
-  if (!AMDGPU::isShader(CallConv)) {
-    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
-  } else {
-    assert(!Info->hasDispatchPtr() &&
-           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
-           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
-           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
-           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
-           !Info->hasWorkItemIDZ());
+  if (Info.hasWorkItemIDY()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasPrivateMemoryInputPtr()) {
-    unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
-    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+  if (Info.hasWorkItemIDZ()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+}
+
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) {
+  if (Info.hasPrivateMemoryInputPtr()) {
+    unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
+    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(PrivateMemoryPtrReg);
   }
 
   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
-  if (Info->hasPrivateSegmentBuffer()) {
-    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
-    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+  if (Info.hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
     CCInfo.AllocateReg(PrivateSegmentBufferReg);
   }
 
-  if (Info->hasDispatchPtr()) {
-    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+  if (Info.hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(DispatchPtrReg);
   }
 
-  if (Info->hasQueuePtr()) {
-    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+  if (Info.hasQueuePtr()) {
+    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(QueuePtrReg);
   }
 
-  if (Info->hasKernargSegmentPtr()) {
-    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+  if (Info.hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(InputPtrReg);
   }
 
-  if (Info->hasDispatchID()) {
-    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+  if (Info.hasDispatchID()) {
+    unsigned DispatchIDReg = Info.addDispatchID(TRI);
     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(DispatchIDReg);
   }
 
-  if (Info->hasFlatScratchInit()) {
-    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+  if (Info.hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(FlatScratchInitReg);
   }
 
-  if (!AMDGPU::isShader(CallConv))
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
-  else
-    AnalyzeFormalArguments(CCInfo, Splits);
-
-  SmallVector<SDValue, 16> Chains;
-
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
-    const ISD::InputArg &Arg = Ins[i];
-    if (Skipped[i]) {
-      InVals.push_back(DAG.getUNDEF(Arg.VT));
-      continue;
-    }
-
-    CCValAssign &VA = ArgLocs[ArgIdx++];
-    MVT VT = VA.getLocVT();
-
-    if (VA.isMemLoc()) {
-      VT = Ins[i].VT;
-      EVT MemVT = VA.getLocVT();
-      const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
-                              VA.getLocMemOffset();
-      // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
-      SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
-                                   Offset, Ins[i].Flags.isSExt(),
-                                   &Ins[i]);
-      Chains.push_back(Arg.getValue(1));
-
-      auto *ParamTy =
-        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
-      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-        // On SI local pointers are just offsets into LDS, so they are always
-        // less than 16-bits. On CI and newer they could potentially be
-        // real pointers, so we can't guarantee their size.
-        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
-                          DAG.getValueType(MVT::i16));
-      }
-
-      InVals.push_back(Arg);
-      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
-      continue;
-    }
-    assert(VA.isRegLoc() && "Parameter must be in a register!");
-
-    unsigned Reg = VA.getLocReg();
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-
-    if (Arg.VT.isVector()) {
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
-
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
-
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
-    }
-
-    InVals.push_back(Val);
-  }
-
   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
   // these from the dispatch pointer.
+}
 
-  // Start adding system SGPRs.
-  if (Info->hasWorkGroupIDX()) {
-    unsigned Reg = Info->addWorkGroupIDX();
+// Allocate special input registers that are initialized per-wave.
+static void allocateSystemSGPRs(CCState &CCInfo,
+                                MachineFunction &MF,
+                                SIMachineFunctionInfo &Info,
+                                bool IsShader) {
+  if (Info.hasWorkGroupIDX()) {
+    unsigned Reg = Info.addWorkGroupIDX();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupIDY()) {
-    unsigned Reg = Info->addWorkGroupIDY();
+  if (Info.hasWorkGroupIDY()) {
+    unsigned Reg = Info.addWorkGroupIDY();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupIDZ()) {
-    unsigned Reg = Info->addWorkGroupIDZ();
+  if (Info.hasWorkGroupIDZ()) {
+    unsigned Reg = Info.addWorkGroupIDZ();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupInfo()) {
-    unsigned Reg = Info->addWorkGroupInfo();
+  if (Info.hasWorkGroupInfo()) {
+    unsigned Reg = Info.addWorkGroupInfo();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasPrivateSegmentWaveByteOffset()) {
+  if (Info.hasPrivateSegmentWaveByteOffset()) {
     // Scratch wave offset passed in system SGPR.
     unsigned PrivateSegmentWaveByteOffsetReg;
 
-    if (AMDGPU::isShader(CallConv)) {
+    if (IsShader) {
       PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
-      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+      Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
     } else
-      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
 
     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
   }
+}
 
+static void reservePrivateMemoryRegs(const TargetMachine &TM,
+                                     MachineFunction &MF,
+                                     const SIRegisterInfo &TRI,
+                                     SIMachineFunctionInfo &Info) {
   // Now that we've figured out where the scratch register inputs are, see if
   // should reserve the arguments and use them directly.
   bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
   if (HasStackObjects)
-    Info->setHasNonSpillStackObjects(true);
+    Info.setHasNonSpillStackObjects(true);
 
   // Everything live out of a block is spilled with fast regalloc, so it's
   // almost certain that spilling will be required.
-  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+  if (TM.getOptLevel() == CodeGenOpt::None)
     HasStackObjects = true;
 
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects) {
       // If we have stack objects, we unquestionably need the private buffer
       // resource. For the Code Object V2 ABI, this will be the first 4 user
       // SGPR inputs. We can reserve those and use them directly.
-      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+      unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
-      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
 
-      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+      unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+      Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
     } else {
       unsigned ReservedBufferReg
-        = TRI->reservedPrivateSegmentBufferReg(MF);
+        = TRI.reservedPrivateSegmentBufferReg(MF);
       unsigned ReservedOffsetReg
-        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
 
       // We tentatively reserve the last registers (skipping the last two
       // which may contain VCC). After register allocation, we'll replace
       // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
       // to these reserved registers.
-      Info->setScratchRSrcReg(ReservedBufferReg);
-      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setScratchRSrcReg(ReservedBufferReg);
+      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
     }
   } else {
-    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
-    Info->setScratchRSrcReg(ReservedBufferReg);
+    Info.setScratchRSrcReg(ReservedBufferReg);
 
     if (HasStackObjects) {
-      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+      unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
     } else {
       unsigned ReservedOffsetReg
-        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
-      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
     }
   }
+}
 
-  if (Info->hasWorkItemIDX()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+SDValue SITargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+    const Function *Fn = MF.getFunction();
+    DiagnosticInfoUnsupported NoGraphicsHSA(
+        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+    DAG.getContext()->diagnose(NoGraphicsHSA);
+    return DAG.getEntryNode();
   }
 
-  if (Info->hasWorkItemIDY()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+  // Create stack objects that are used for emitting debugger prologue if
+  // "amdgpu-debugger-emit-prologue" attribute was specified.
+  if (ST.debuggerEmitPrologue())
+    createDebuggerPrologueStackObjects(MF);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  BitVector Skipped(Ins.size());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  bool IsShader = AMDGPU::isShader(CallConv);
+  bool IsKernel = !IsShader;
+  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
+
+  if (IsShader) {
+    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+
+    // At least one interpolation mode must be enabled or else the GPU will
+    // hang.
+    //
+    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+    // set PSInputAddr, the user wants to enable some bits after the compilation
+    // based on run-time states. Since we can't know what the final PSInputEna
+    // will look like, we shouldn't do anything here and the user should take
+    // responsibility for the correct programming.
+    //
+    // Otherwise, the following restrictions apply:
+    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+    //   enabled too.
+    if (CallConv == CallingConv::AMDGPU_PS &&
+        ((Info->getPSInputAddr() & 0x7F) == 0 ||
+         ((Info->getPSInputAddr() & 0xF) == 0 &&
+          Info->isPSInputAllocated(11)))) {
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      Info->markPSInputAllocated(0);
+      Info->markPSInputEnabled(0);
+    }
+
+    assert(!Info->hasDispatchPtr() &&
+           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
+  } else {
+    assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
   }
 
-  if (Info->hasWorkItemIDZ()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+  if (IsEntryFunc) {
+    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
-  if (Chains.empty())
-    return Chain;
+  if (IsKernel) {
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  } else {
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+  }
+
+  SmallVector<SDValue, 16> Chains;
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+    if (Skipped[i]) {
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    MVT VT = VA.getLocVT();
+
+    if (IsEntryFunc && VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = VA.getLocVT();
+
+      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
+                              VA.getLocMemOffset();
+      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes.
+      SDValue Arg = lowerKernargMemParameter(
+        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+      Chains.push_back(Arg.getValue(1));
+
+      auto *ParamTy =
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        // On SI local pointers are just offsets into LDS, so they are always
+        // less than 16-bits. On CI and newer they could potentially be
+        // real pointers, so we can't guarantee their size.
+        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+                          DAG.getValueType(MVT::i16));
+      }
+
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    if (VA.isMemLoc())
+      report_fatal_error("memloc not supported with calling convention");
+
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    if (Arg.VT.isVector()) {
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+
+        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+        Regs.push_back(Copy);
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      Regs.append(NumElements, DAG.getUNDEF(VT));
+
+      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  // Start adding system SGPRs.
+  if (IsEntryFunc)
+    allocateSystemSGPRs(CCInfo, MF, *Info, IsShader);
+
+  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
 
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  return Chains.empty() ? Chain :
+    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
 SDValue
@@ -2603,8 +2665,8 @@
                                       MVT VT, unsigned Offset) const {
   SDLoc SL(Op);
-  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
-                                 DAG.getEntryNode(), Offset, false);
+  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
+                                           DAG.getEntryNode(), Offset, false);
 
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -2662,7 +2724,7 @@
   }
   case Intrinsic::amdgcn_implicitarg_ptr: {
     unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
-    return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+    return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
   }
   case Intrinsic::amdgcn_kernarg_segment_ptr: {
     unsigned Reg
@@ -2704,38 +2766,38 @@
     if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_X, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_X, false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Y, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_Y, false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Z, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_Z, false);
   case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
   case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
   case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
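
Note (not part of the patch): every kernel argument, like the r600_read_* queries above, now goes through lowerKernargMemParameter, i.e. a pointer computed by lowerKernArgParameterPtr, an invariant load from the kernarg segment, and convertArgType's extend or truncate. A minimal IR sketch of such an input follows; the kernel and value names are illustrative only, not taken from the LLVM test suite:

; Illustrative kernel, assuming the usual kernarg ABI. The i8 argument is
; loaded from the kernarg segment at its kernarg offset and then widened to
; the 32-bit register type; since it is marked zeroext, the widening is a
; zero extension rather than a sign extension.
define amdgpu_kernel void @kernarg_zext_example(i8 zeroext %x, i32 addrspace(1)* %out) {
  %ext = zext i8 %x to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}
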
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -112,6 +112,8 @@
   // Graphics info.
   unsigned PSInputAddr;
+  unsigned PSInputEnable;
+
   bool ReturnsVoid;
 
   // A pair of default/requested minimum/maximum flat work group sizes.
@@ -133,9 +135,6 @@
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
-  unsigned PSInputEna;
-
-  unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
@@ -423,6 +422,10 @@
     return PSInputAddr;
   }
 
+  unsigned getPSInputEnable() const {
+    return PSInputEnable;
+  }
+
   bool isPSInputAllocated(unsigned Index) const {
     return PSInputAddr & (1 << Index);
   }
@@ -431,6 +434,10 @@
     PSInputAddr |= 1 << Index;
   }
 
+  void markPSInputEnabled(unsigned Index) {
+    PSInputEnable |= 1 << Index;
+  }
+
   bool returnsVoid() const {
     return ReturnsVoid;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -41,13 +41,13 @@
     WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
+    PSInputEnable(0),
     ReturnsVoid(true),
     FlatWorkGroupSizes(0, 0),
     WavesPerEU(0, 0),
     DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
     DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
     LDSWaveSpillSize(0),
-    PSInputEna(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -252,8 +252,15 @@
 
 unsigned getInitialPSInputAddr(const Function &F);
 
-bool isShader(CallingConv::ID cc);
-bool isCompute(CallingConv::ID cc);
+LLVM_READNONE
+bool isShader(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isCompute(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isEntryFunctionCC(CallingConv::ID CC);
+
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -516,6 +516,10 @@
   return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
 }
 
+bool isEntryFunctionCC(CallingConv::ID CC) {
+  return true;
+}
+
 bool isSI(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
 }
Index: test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -13,9 +13,10 @@
 
 ; GCN-LABEL: {{^}}kernel_cc:
 ; GCN: s_endpgm
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+define amdgpu_kernel void @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
-  ret float %xf
+  store float %xf, float addrspace(1)* undef
+  ret void
 }
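
A companion sketch for the shader path (again illustrative, not part of the patch): processShaderInputArgs splits vector inputs into scalar InputArgs and can skip unused, non-inreg PS inputs, which is the bookkeeping the PSInputAddr/PSInputEnable accessors above record. The function and argument names here are hypothetical:

; Hypothetical amdgpu_ps shader. The <3 x float> input should occupy exactly
; three scalar registers (three, NOT four), and %unused is a candidate for
; skipping since it is never read, so its PS input bit need not be enabled.
define amdgpu_ps float @ps_split_example(<3 x float> %persp, float %unused) {
  %x = extractelement <3 x float> %persp, i32 0
  %y = extractelement <3 x float> %persp, i32 1
  %sum = fadd float %x, %y
  ret float %sum
}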