diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -167,6 +167,10 @@ GCCBuiltin<"__builtin_amdgcn_dispatch_id">, Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_amdgcn_lds_kernel_id : + GCCBuiltin<"__builtin_amdgcn_lds_kernel_id">, + Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + def int_amdgcn_implicit_buffer_ptr : GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -103,6 +103,7 @@ KERNARG_SEGMENT_PTR = 3, DISPATCH_ID = 4, FLAT_SCRATCH_INIT = 5, + LDS_KERNEL_ID = 6, WORKGROUP_ID_X = 10, WORKGROUP_ID_Y = 11, WORKGROUP_ID_Z = 12, @@ -128,6 +129,7 @@ ArgDescriptor DispatchID; ArgDescriptor FlatScratchInit; ArgDescriptor PrivateSegmentSize; + ArgDescriptor LDSKernelId; // System SGPRs in kernels. ArgDescriptor WorkGroupIDX; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -74,6 +74,7 @@ << " WorkGroupIDY: " << FI.second.WorkGroupIDY << " WorkGroupIDZ: " << FI.second.WorkGroupIDZ << " WorkGroupInfo: " << FI.second.WorkGroupInfo + << " LDSKernelId: " << FI.second.LDSKernelId << " PrivateSegmentWaveByteOffset: " << FI.second.PrivateSegmentWaveByteOffset << " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr @@ -107,6 +108,9 @@ case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: return std::make_tuple(WorkGroupIDZ ?
&WorkGroupIDZ : nullptr, &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); + case AMDGPUFunctionArgInfo::LDS_KERNEL_ID: + return std::make_tuple(LDSKernelId ? &LDSKernelId : nullptr, + &AMDGPU::SGPR_32RegClass, LLT::scalar(32)); case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: return std::make_tuple( PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr, @@ -162,6 +166,7 @@ AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12); AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13); AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14); + AI.LDSKernelId = ArgDescriptor::createRegister(AMDGPU::SGPR15); const unsigned Mask = 0x3ff; AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def @@ -27,5 +27,6 @@ AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x") AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y") AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z") +AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id") #undef AMDGPU_ATTRIBUTE diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -72,6 +72,8 @@ case Intrinsic::amdgcn_workgroup_id_z: case Intrinsic::r600_read_tgid_z: return WORKGROUP_ID_Z; + case Intrinsic::amdgcn_lds_kernel_id: + return LDS_KERNEL_ID; case Intrinsic::amdgcn_dispatch_ptr: return DISPATCH_PTR; case Intrinsic::amdgcn_dispatch_id: @@ -457,6 +459,10 @@ removeAssumedBits(QUEUE_PTR); } + if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) { + removeAssumedBits(LDS_KERNEL_ID); + } + return getAssumed() != OrigAssumed ? 
ChangeStatus::CHANGED : ChangeStatus::UNCHANGED; } @@ -591,6 +597,16 @@ return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this, UsedAssumedInformation); } + + bool funcRetrievesLDSKernelId(Attributor &A) { + auto DoesNotRetrieve = [&](Instruction &I) { + auto &Call = cast<CallBase>(I); + return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id; + }; + bool UsedAssumedInformation = false; + return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this, + UsedAssumedInformation); + } }; AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -765,7 +765,8 @@ AMDGPUFunctionArgInfo::DISPATCH_ID, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID, }; static constexpr StringLiteral ImplicitAttrNames[] = { @@ -775,7 +776,8 @@ "amdgpu-no-dispatch-id", "amdgpu-no-workgroup-id-x", "amdgpu-no-workgroup-id-y", - "amdgpu-no-workgroup-id-z" + "amdgpu-no-workgroup-id-z", + "amdgpu-no-lds-kernel-id", }; MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h @@ -149,6 +149,13 @@ bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const; + + bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + + bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const; + bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const; diff
--git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3864,6 +3864,38 @@ return true; } +bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Function &F = B.getMF().getFunction(); + ConstantInt *KnownSize = AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (KnownSize) { + B.buildCopy(DstReg, + B.buildConstant(LLT::scalar(32), KnownSize->getZExtValue())); + return true; + } else { + return false; + } +} + +bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (!MFI->isEntryFunction()) { + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); + } + + Register DstReg = MI.getOperand(0).getReg(); + if (!getLDSKernelId(DstReg, MRI, B)) + return false; + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, @@ -5260,6 +5292,9 @@ case Intrinsic::amdgcn_workgroup_id_z: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_lds_kernel_id: + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); case Intrinsic::amdgcn_dispatch_ptr: return legalizePreloadedArgIntrin(MI, MRI, B, AMDGPUFunctionArgInfo::DISPATCH_PTR); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -163,6 +163,22 @@ initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry()); } + static std::vector<Function *> getKernels(Module &M) { + std::vector<Function *>
OrderedKernels; + for (Function &F : M.functions()) { + if (F.isDeclaration()) + continue; + if (!AMDGPU::isKernel(F.getCallingConv())) + continue; + OrderedKernels.push_back(&F); + } + std::sort(OrderedKernels.begin(), OrderedKernels.end(), + [](const Function *X, const Function *Y) -> bool { + return X->getName() < Y->getName(); + }); + return OrderedKernels; + } + bool runOnModule(Module &M) override { CallGraph CG = CallGraph(M); UsedList = getUsedList(M); @@ -179,6 +195,21 @@ Changed |= processUsedLDS(CG, M, &F); } + std::vector<Function *> OrderedKernels = getKernels(M); + { + LLVMContext &Ctx = M.getContext(); + IRBuilder<> Builder(Ctx); + for (size_t i = 0; i < OrderedKernels.size(); i++) { + // todo: error if > 2**32 kernels + llvm::Metadata *AttrMDArgs[1] = { + llvm::ConstantAsMetadata::get(Builder.getInt32(i)), + }; + OrderedKernels[i]->setMetadata("llvm.amdgcn.lds_kernel_id", + llvm::MDNode::get(Ctx, AttrMDArgs)); + } + } + Changed = true; + UsedList.clear(); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -104,6 +104,8 @@ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV); void allocateModuleLDSGlobal(const Function &F); + static ConstantInt *getLDSKernelIdMetadata(const Function &F); + Align getDynLDSAlign() const { return DynLDSAlign; } void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -11,6 +11,7 @@ #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/Constants.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -101,6
+102,20 @@ } } +ConstantInt *AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) { + auto MD = F.getMetadata("llvm.amdgcn.lds_kernel_id"); + if (MD && MD->getNumOperands() == 1) { + ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0)); + if (KnownSize) { + uint64_t v = KnownSize->getZExtValue(); + if (v <= UINT32_MAX) { + return KnownSize; + } + } + } + return nullptr; +} + void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV) { assert(DL.getTypeAllocSize(GV.getValueType()).isZero()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -734,7 +734,9 @@ 2 + // kernel segment ptr 2 + // dispatch ID 2 + // flat scratch init - 2; // Implicit buffer ptr + 2 + // Implicit buffer ptr + 1; // LDSKernelId + // Max number of system SGPRs unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX 1 + // WorkGroupIDY diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1523,6 +1523,9 @@ parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize, AMDGPU::SGPR_32RegClass, MFI->ArgInfo.PrivateSegmentSize, 0, 0) || + parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId, + AMDGPU::SGPR_32RegClass, + MFI->ArgInfo.LDSKernelId, 0, 1) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 0, 1) || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -526,6 +526,18 @@ PreloadedScratchRsrcReg, ScratchRsrcReg, ScratchWaveOffsetReg); } + + if (MFI->hasLDSKernelId()) { + // TODO: Initialise reg somewhere that is easier to elide + // If the
metadata is missing, the kernel can't initialise the ID + // register + ConstantInt *v = AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (v) { + Register Reg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::LDS_KERNEL_ID); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Reg) + .addImm(v->getZExtValue()); + } + } } // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -48,6 +48,7 @@ SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; + SDValue getLDSKernelId(SelectionDAG &DAG, const SDLoc &SL) const; SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, uint64_t Offset, Align Alignment, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1634,6 +1634,19 @@ return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset); } +SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG, + const SDLoc &SL) const { + + Function &F = DAG.getMachineFunction().getFunction(); + ConstantInt *KnownSize = AMDGPUMachineFunction::getLDSKernelIdMetadata(F); + if (KnownSize) { + return DAG.getConstant(KnownSize->getZExtValue(), SL, MVT::i32); + } else { + LLVM_DEBUG(dbgs() << " Failed to lower LDSKernelId\n";); + return SDValue(); + } +} + SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val, bool Signed, @@ -2019,6 +2032,9 @@ if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); + + if (Info.hasLDSKernelId()) + allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId); } // 
Allocate special inputs passed in user SGPRs. @@ -2072,6 +2088,12 @@ CCInfo.AllocateReg(FlatScratchInitReg); } + if (Info.hasLDSKernelId()) { + Register Reg = Info.addLDSKernelId(); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read // these from the dispatch pointer. } @@ -2297,8 +2319,8 @@ (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); } if (CallConv == CallingConv::AMDGPU_PS) { @@ -2712,7 +2734,8 @@ {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"}, - {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"} + {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}, + {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"}, }; for (auto Attr : ImplicitAttrs) { @@ -6774,6 +6797,12 @@ case Intrinsic::amdgcn_workgroup_id_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); + case Intrinsic::amdgcn_lds_kernel_id: { + if (MFI->isEntryFunction()) + return getLDSKernelId(DAG, DL); + return getPreloadedValue(DAG, *MFI, VT, + AMDGPUFunctionArgInfo::LDS_KERNEL_ID); + } case Intrinsic::amdgcn_workitem_id_x: return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -190,6 +190,7 @@ 
Optional<SIArgument> WorkGroupIDY; Optional<SIArgument> WorkGroupIDZ; Optional<SIArgument> WorkGroupInfo; + Optional<SIArgument> LDSKernelId; Optional<SIArgument> PrivateSegmentWaveByteOffset; Optional<SIArgument> ImplicitArgPtr; @@ -214,6 +215,7 @@ YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY); YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ); YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo); + YamlIO.mapOptional("LDSKernelId", AI.LDSKernelId); YamlIO.mapOptional("privateSegmentWaveByteOffset", AI.PrivateSegmentWaveByteOffset); @@ -417,6 +419,7 @@ bool WorkGroupIDY : 1; bool WorkGroupIDZ : 1; bool WorkGroupInfo : 1; + bool LDSKernelId : 1; bool PrivateSegmentWaveByteOffset : 1; bool WorkItemIDX : 1; // Always initialized. @@ -610,6 +613,7 @@ Register addDispatchID(const SIRegisterInfo &TRI); Register addFlatScratchInit(const SIRegisterInfo &TRI); Register addImplicitBufferPtr(const SIRegisterInfo &TRI); + Register addLDSKernelId(); // Add system SGPRs. Register addWorkGroupIDX() { @@ -700,6 +704,8 @@ return WorkGroupInfo; } + bool hasLDSKernelId() const { return LDSKernelId; } + bool hasPrivateSegmentWaveByteOffset() const { return PrivateSegmentWaveByteOffset; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -41,6 +41,7 @@ WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false), + LDSKernelId(false), PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), @@ -132,6 +133,9 @@ ST.getMaxWorkitemID(F, 2) != 0) WorkItemIDZ = true; + if (!F.hasFnAttribute("amdgpu-no-lds-kernel-id")) + LDSKernelId = true; + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) DispatchPtr = true; @@ -251,6 +255,12 @@ return ArgInfo.ImplicitBufferPtr.getRegister(); } +Register SIMachineFunctionInfo::addLDSKernelId() { + ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR()); + NumUserSGPRs += 1; + return
ArgInfo.LDSKernelId.getRegister(); +} + bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) { for (unsigned I = 0; CSRegs[I]; ++I) { @@ -551,6 +561,7 @@ Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr); Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID); Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit); + Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId); Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize); Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX); Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll @@ -0,0 +1,33 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.amdgcn.lds.kernel.id() +declare i32 @llvm.amdgcn.workgroup.id.x() + +define void @function_lds_id(i32 addrspace(1)* %out) { + %tmp0 = call i32 @llvm.amdgcn.lds.kernel.id() + %help = call i32 @llvm.amdgcn.workgroup.id.x() + %both = add i32 %tmp0, %help + store i32 %both, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @kernel_lds_id(i32 addrspace(1)* %out) !amdgcn.lds_kernel_id !0 { + %tmp0 = call i32 @llvm.amdgcn.lds.kernel.id() + %help = call i32 @llvm.amdgcn.workgroup.id.x() + %both = add i32 %tmp0, %help + store i32 %both, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @indirect_lds_id(i32 addrspace(1)* %out) !amdgcn.lds_kernel_id !0 { + call void @function_lds_id(i32 addrspace(1) * %out) + ret void +} + +define amdgpu_kernel void @doesnt_use_it(i32 addrspace(1)* %out) !amdgcn.lds_kernel_id !0 { + store i32 100, i32 addrspace(1)* %out + ret void +} + + +!0 = !{i32 42}