diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -393,28 +393,29 @@ const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo(); uint16_t KernelCodeProperties = 0; + const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo(); - if (MFI.hasPrivateSegmentBuffer()) { + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } - if (MFI.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { + if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } - if (MFI.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; } - if (MFI.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; } - if (MFI.hasFlatScratchInit()) { + if (UserSGPRInfo.hasFlatScratchInit()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; } @@ -1165,27 +1166,28 @@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize(true))); - if (MFI->hasPrivateSegmentBuffer()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo(); + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; } - if (MFI->hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (MFI->hasQueuePtr() && CodeObjectVersion < 
AMDGPU::AMDHSA_COV5) + if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; - if (MFI->hasKernargSegmentPtr()) + if (UserSGPRInfo.hasKernargSegmentPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; - if (MFI->hasDispatchID()) + if (UserSGPRInfo.hasDispatchID()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; - if (MFI->hasFlatScratchInit()) + if (UserSGPRInfo.hasFlatScratchInit()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; - if (MFI->hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; if (STM.isXNACKEnabled()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -455,27 +455,28 @@ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) { // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
- if (Info.hasPrivateSegmentBuffer()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register InputPtrReg = Info.addKernargSegmentPtr(TRI); const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -486,13 +487,13 @@ CCInfo.AllocateReg(InputPtrReg); } - if (Info.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit()) { + if (UserSGPRInfo.hasFlatScratchInit()) { Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); @@ -597,15 +598,16 @@ SmallVector ArgLocs; CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext()); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); - if (Info->hasImplicitBufferPtr()) { + if (UserSGPRInfo.hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); 
CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: This probably isn't defined for mesa - if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { + if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) { Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -1097,7 +1097,7 @@ Offset += 8; // Skipped. } - if (MFI.hasQueuePtr()) + if (MFI.getUserSGPRInfo().hasQueuePtr()) emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -13,17 +13,70 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" + #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" using namespace llvm; namespace { +class PreloadKernelArgInfo { +private: + Function &F; + + const GCNSubtarget &ST; + + unsigned NumFreeUserSGPRs; + +public: + SmallVector KernelArgMetadata; + + PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { + setInitialFreeUserSGPRsCount(); + } + + // Sets NumFreeUserSGPRs to the number of user SGPRs still available to preload + // arguments. 
+ void setInitialFreeUserSGPRsCount() { + const unsigned MaxUserSGRPs = ST.getMaxNumUserSGPRs(); + GCNUserSGPRUsageInfo UserSGPRInfo(F, ST); + + NumFreeUserSGPRs = MaxUserSGRPs - UserSGPRInfo.getNumUsedUserSGPRs(); + } + + unsigned allocPreloadSGPRs(bool IsInReg, bool InPreloadSequence, + unsigned AllocSize, uint64_t ArgOffset, + uint64_t LastExplicitArgOffset) { + + if (!IsInReg || !InPreloadSequence) + return 0; + + // Check if this argument may be loaded into the same register as the + // previous argument. + if (!isAligned(Align(4), ArgOffset) && AllocSize < 4) + return 1; + + // Pad SGPRs for kernarg alignment. + unsigned Padding = ArgOffset - LastExplicitArgOffset; + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; + unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4; + if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs) + return 0; + + NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs); + return NumPreloadSGPRs; + } +}; + class AMDGPULowerKernelArguments : public FunctionPass { public: static char ID; @@ -84,8 +137,17 @@ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; - + // Preloaded kernel arguments must be sequential. + bool InPreloadSequence = true; + bool HasPreloadArgs = false; + PreloadKernelArgInfo PreloadInfo(F, ST); + MDNode *MD = F.getMetadata("preload_kernel_args"); + if (!MD) + InPreloadSequence = false; + + int Idx = -1; for (Argument &Arg : F.args()) { + Idx++; const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); MaybeAlign ParamAlign = IsByRef ? 
Arg.getParamAlign() : std::nullopt; @@ -95,10 +157,31 @@ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset; + uint64_t LastExplicitArgOffset = ExplicitArgOffset; ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize; - if (Arg.use_empty()) + if (Arg.use_empty()) { + InPreloadSequence = false; continue; + } + + // Try to preload this argument. + unsigned PreloadSGPRs = PreloadInfo.allocPreloadSGPRs( + Arg.hasInRegAttr(), InPreloadSequence, AllocSize, EltOffset, + LastExplicitArgOffset); + if (PreloadSGPRs && !Arg.getType()->isAggregateType()) { + // Preload this argument. + HasPreloadArgs = true; + MDBuilder MDB(Ctx); + auto *MDIndex = + MDB.createConstant(llvm::ConstantInt::get(Builder.getInt32Ty(), Idx)); + auto *MDAllocSizeSGPRs = MDB.createConstant( + llvm::ConstantInt::get(Builder.getInt32Ty(), PreloadSGPRs)); + PreloadInfo.KernelArgMetadata.push_back( + llvm::MDNode::get(Ctx, {MDIndex, MDAllocSizeSGPRs})); + } else { + InPreloadSequence = false; + } // If this is byval, the loads are already explicit in the function. We just // need to rewrite the pointer values. @@ -223,6 +306,11 @@ } } + if (HasPreloadArgs) { + F.setMetadata("preload_kernel_args", + llvm::MDNode::get(Ctx, PreloadInfo.KernelArgMetadata)); + } + KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -185,7 +185,7 @@ // // If we only have implicit uses of flat_scr on flat instructions, it is not // really needed. 
- if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && + if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() && (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -17,6 +17,7 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "R600Subtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -692,7 +693,7 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo(); - return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit()); + return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); } unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { @@ -770,25 +771,27 @@ getReservedNumSGPRs(MF)); } -static unsigned getMaxNumPreloadedSGPRs() { +static constexpr unsigned getMaxNumPreloadedSGPRs() { + using USI = GCNUserSGPRUsageInfo; // Max number of user SGPRs - unsigned MaxUserSGPRs = 4 + // private segment buffer - 2 + // Dispatch ptr - 2 + // queue ptr - 2 + // kernel segment ptr - 2 + // dispatch ID - 2 + // flat scratch init - 2; // Implicit buffer ptr + const unsigned MaxUserSGPRs = + USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + + USI::getNumUserSGPRForField(USI::DispatchPtrID) + + USI::getNumUserSGPRForField(USI::QueuePtrID) + + USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + + USI::getNumUserSGPRForField(USI::DispatchIdID) + + USI::getNumUserSGPRForField(USI::FlatScratchInitID) + + USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); // Max number of system SGPRs - unsigned MaxSystemSGPRs = 
1 + // WorkGroupIDX - 1 + // WorkGroupIDY - 1 + // WorkGroupIDZ - 1 + // WorkGroupInfo - 1; // private segment wave byte offset + const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX + 1 + // WorkGroupIDY + 1 + // WorkGroupIDZ + 1 + // WorkGroupInfo + 1; // private segment wave byte offset // Max number of synthetic SGPRs - unsigned SyntheticSGPRs = 1; // LDSKernelId + const unsigned SyntheticSGPRs = 1; // LDSKernelId return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; } @@ -1018,3 +1021,73 @@ else return static_cast(TM.getSubtarget(F)); } + +GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, + const GCNSubtarget &ST) { + const CallingConv::ID CC = F.getCallingConv(); + const bool IsKernel = + CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; + // FIXME: Should have analysis or something rather than attribute to detect + // calls. + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + // FIXME: This attribute is a hack, we just need an analysis on the function + // to look for allocas. + const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); + + if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) + KernargSegmentPtr = true; + + bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); + if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + PrivateSegmentBuffer = true; + else if (ST.isMesaGfxShader(F)) + ImplicitBufferPtr = true; + + if (!AMDGPU::isGraphics(CC)) { + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) + DispatchPtr = true; + + // FIXME: Can this always be disabled with < COv5? + if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) + QueuePtr = true; + + if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) + DispatchID = true; + } + + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. 
+ if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && + (IsAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + FlatScratchInit = true; + } +} + +unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const { + unsigned NumUserSGPRs = 0; + if (hasImplicitBufferPtr()) + NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); + + if (hasPrivateSegmentBuffer()) + NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); + + if (hasDispatchPtr()) + NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID); + + if (hasQueuePtr()) + NumUserSGPRs += getNumUserSGPRForField(QueuePtrID); + + if (hasKernargSegmentPtr()) + NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); + + if (hasDispatchID()) + NumUserSGPRs += getNumUserSGPRForField(DispatchIdID); + + if (hasFlatScratchInit()) + NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + + return NumUserSGPRs; +} diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -22,6 +22,7 @@ #include "SIInstrInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#include "llvm/Support/ErrorHandling.h" #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" @@ -1378,6 +1379,79 @@ } }; +class GCNUserSGPRUsageInfo { +public: + unsigned getNumUsedUserSGPRs() const; + + bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; } + + bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; } + + bool hasDispatchPtr() const { return DispatchPtr; } + + bool hasQueuePtr() const { return QueuePtr; } + + bool hasKernargSegmentPtr() const { return KernargSegmentPtr; } + + bool hasDispatchID() const { return DispatchID; } + + bool hasFlatScratchInit() const { return FlatScratchInit; } + + enum UserSGPRID : unsigned { + ImplicitBufferPtrID 
= 0, + PrivateSegmentBufferID = 1, + DispatchPtrID = 2, + QueuePtrID = 3, + KernargSegmentPtrID = 4, + DispatchIdID = 5, + FlatScratchInitID = 6, + PrivateSegmentSizeID = 7 + }; + + // Returns the size in number of SGPRs for preload user SGPR field. + static constexpr unsigned getNumUserSGPRForField(UserSGPRID ID) { + switch (ID) { + case ImplicitBufferPtrID: + return 2; + case PrivateSegmentBufferID: + return 4; + case DispatchPtrID: + return 2; + case QueuePtrID: + return 2; + case KernargSegmentPtrID: + return 2; + case DispatchIdID: + return 2; + case FlatScratchInitID: + return 2; + case PrivateSegmentSizeID: + return 1; + } + llvm_unreachable("Unknown UserSGPRID."); + } + + GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST); + +private: + // Private memory buffer + // Compute directly in sgpr[0:1] + // Other shaders indirect 64-bits at sgpr[0:1] + bool ImplicitBufferPtr = false; + + bool PrivateSegmentBuffer = false; + + bool DispatchPtr = false; + + bool QueuePtr = false; + + bool KernargSegmentPtr = false; + + bool DispatchID = false; + + bool FlatScratchInit = false; +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -692,7 +692,7 @@ } bool NeedsFlatScratchInit = - MFI->hasFlatScratchInit() && + MFI->getUserSGPRInfo().hasFlatScratchInit() && (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() || (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch())); @@ -775,7 +775,7 @@ // Use relocations to get the pointer, and setup the other bits manually. 
uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - if (MFI->hasImplicitBufferPtr()) { + if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) { Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) { @@ -814,7 +814,6 @@ BuildMI(MBB, I, DL, SMovB32, Rsrc1) .addExternalSymbol("SCRATCH_RSRC_DWORD1") .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - } BuildMI(MBB, I, DL, SMovB32, Rsrc2) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -2137,13 +2138,14 @@ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); // TODO: Unify handling with private memory pointers. - if (Info.hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); @@ -2152,7 +2154,7 @@ if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (Info.hasDispatchID()) + if (UserSGPRInfo.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. 
@@ -2175,34 +2177,35 @@ MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { - if (Info.hasImplicitBufferPtr()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); + if (UserSGPRInfo.hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? - if (Info.hasPrivateSegmentBuffer()) { + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); @@ -2211,13 +2214,13 @@ MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } - if (Info.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { + if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { Register FlatScratchInitReg = 
Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); @@ -2481,12 +2484,13 @@ bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); if (IsGraphics) { - assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && - !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); + assert(!UserSGPRInfo.hasDispatchPtr() && + !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && + !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); if (!Subtarget->enableFlatScratch()) - assert(!Info->hasFlatScratchInit()); + assert(!UserSGPRInfo.hasFlatScratchInit()); if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ()); @@ -9073,7 +9077,7 @@ const SIMachineFunctionInfo &Info) { // TODO: Should check if the address can definitely not access stack. if (Info.isEntryFunction()) - return Info.hasFlatScratchInit(); + return Info.getUserSGPRInfo().hasFlatScratchInit(); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -16,6 +16,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUMachineFunction.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "SIModeRegisterDefaults.h" @@ -434,13 +435,9 @@ unsigned NumSpilledSGPRs = 0; unsigned NumSpilledVGPRs = 0; - // Feature bits required for inputs passed in user SGPRs. 
- bool PrivateSegmentBuffer : 1; - bool DispatchPtr : 1; - bool QueuePtr : 1; - bool KernargSegmentPtr : 1; - bool DispatchID : 1; - bool FlatScratchInit : 1; + // Tracks information about user SGPRs that will be setup by hardware which + // will apply to all wavefronts of the grid. + GCNUserSGPRUsageInfo UserSGPRInfo; // Feature bits required for inputs passed in system SGPRs. bool WorkGroupIDX : 1; // Always initialized. @@ -454,11 +451,6 @@ bool WorkItemIDY : 1; bool WorkItemIDZ : 1; - // Private memory buffer - // Compute directly in sgpr[0:1] - // Other shaders indirect 64-bits at sgpr[0:1] - bool ImplicitBufferPtr : 1; - // Pointer to where the ABI inserts special kernel arguments separate from the // user arguments. This is an offset from the KernargSegmentPtr. bool ImplicitArgPtr : 1; @@ -599,6 +591,8 @@ return PrologEpilogSGPRSpills; } + const GCNUserSGPRUsageInfo &getUserSGPRInfo() const { return UserSGPRInfo; } + void addToPrologEpilogSGPRSpills(Register Reg, PrologEpilogSGPRSaveRestoreInfo SI) { PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI)); @@ -778,6 +772,8 @@ return ArgInfo.WorkGroupInfo.getRegister(); } + bool hasLDSKernelId() const { return LDSKernelId; } + // Add special VGPR inputs void setWorkItemIDX(ArgDescriptor Arg) { ArgInfo.WorkItemIDX = Arg; @@ -802,30 +798,6 @@ ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } - bool hasPrivateSegmentBuffer() const { - return PrivateSegmentBuffer; - } - - bool hasDispatchPtr() const { - return DispatchPtr; - } - - bool hasQueuePtr() const { - return QueuePtr; - } - - bool hasKernargSegmentPtr() const { - return KernargSegmentPtr; - } - - bool hasDispatchID() const { - return DispatchID; - } - - bool hasFlatScratchInit() const { - return FlatScratchInit; - } - bool hasWorkGroupIDX() const { return WorkGroupIDX; } @@ -842,8 +814,6 @@ return WorkGroupInfo; } - bool hasLDSKernelId() const { return LDSKernelId; } - bool hasPrivateSegmentWaveByteOffset() const { return 
PrivateSegmentWaveByteOffset; } @@ -864,10 +834,6 @@ return ImplicitArgPtr; } - bool hasImplicitBufferPtr() const { - return ImplicitBufferPtr; - } - AMDGPUFunctionArgInfo &getArgInfo() { return ArgInfo; } diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -7,17 +7,18 @@ //===----------------------------------------------------------------------===// #include "SIMachineFunctionInfo.h" -#include "AMDGPUTargetMachine.h" #include "AMDGPUSubtarget.h" -#include "SIRegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIRegisterInfo.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MIRParser/MIParser.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" @@ -36,28 +37,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI) - : AMDGPUMachineFunction(F, *STI), - Mode(F), - GWSResourcePSV(getTM(STI)), - PrivateSegmentBuffer(false), - DispatchPtr(false), - QueuePtr(false), - KernargSegmentPtr(false), - DispatchID(false), - FlatScratchInit(false), - WorkGroupIDX(false), - WorkGroupIDY(false), - WorkGroupIDZ(false), - WorkGroupInfo(false), - LDSKernelId(false), - PrivateSegmentWaveByteOffset(false), - WorkItemIDX(false), - WorkItemIDY(false), - WorkItemIDZ(false), - ImplicitBufferPtr(false), - ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + : AMDGPUMachineFunction(F, *STI), Mode(F), GWSResourcePSV(getTM(STI)), + UserSGPRInfo(F, *STI), 
WorkGroupIDX(false), WorkGroupIDY(false), + WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false), + PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), + WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), + GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { const GCNSubtarget &ST = *static_cast(STI); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); @@ -67,16 +52,10 @@ VRegFlags.reserve(1024); - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; if (IsKernel) { - if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0) - KernargSegmentPtr = true; WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -115,12 +94,6 @@ MayNeedAGPRs = false; // We will select all MAI with VGPR operands. } - bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (isAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - if (!AMDGPU::isGraphics(CC) || (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) @@ -145,33 +118,10 @@ ST.getMaxWorkitemID(F, 2) != 0) WorkItemIDZ = true; - if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) - DispatchPtr = true; - - if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) - QueuePtr = true; - - if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) - DispatchID = true; - if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id")) LDSKernelId = true; } - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); - - // TODO: This could be refined a lot. 
The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering. - if (ST.hasFlatAddressSpace() && isEntryFunction() && - (isAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - if (isEntryFunction()) { // X, XY, and XYZ are the only supported combinations, so make sure Y is // enabled if Z is. diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll b/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll @@ -0,0 +1,647 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s + +; Metadata for each runline is at the bottom of the file. 
+ +define amdgpu_kernel void @test_preload_metadata_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 
8 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: 
[[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_preload_metadata_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr 
addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; 
PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr 
addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; 
PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_kernel void @test_preload_metadata_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; 
NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = 
load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), 
ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg 
[[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 
+; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !7 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: 
[[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-8-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] 
= getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + %load2 = load i32, ptr addrspace(1) %in2 + %load3 = load i32, ptr addrspace(1) %in3 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + store i32 %load2, ptr addrspace(1) %out2 + store i32 %load3, ptr addrspace(1) %out3 + ret void +} + +; Preload args with inreg in the NO-PRELOAD case. 
+ +define amdgpu_kernel void @test_preload_metadata_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 
[[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 
[[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 
[[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 
[[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. + +define amdgpu_kernel void @test_preload_metadata_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr 
addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; 
PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr 
addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_kernel void @test_preload_metadata_kernel_4_misaligned(i16 %arg0, ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned +; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) 
[[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned +; 
PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !3 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; 
PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned +; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !8 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = 
load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_misaligned +; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !9 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 8, !invariant.load !3 +; 
PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_MISALIGNED_KERNARG_SEGMENT]], i64 32 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[EXT]] +; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + %ext = zext i16 %arg0 to i32 + %add = add i32 %load, %ext + store i32 %add, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +; In this case both i16 args will be preloaded into the first SGPR.
+ +define amdgpu_kernel void @test_preload_metadata_kernel_4_i16_i16(i16 %arg0, i16 %arg1, ptr addrspace(1) %out) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 +; NO-PRELOAD-SAME: (i16 [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; NO-PRELOAD-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; NO-PRELOAD-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; NO-PRELOAD-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; NO-PRELOAD-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; NO-PRELOAD-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 +; PRELOAD-1-SAME: (i16 inreg [[ARG0:%.*]], i16 [[ARG1:%.*]], ptr addrspace(1) [[OUT:%.*]]) 
#[[ATTR0]] !preload_kernel_args !3 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-1-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; PRELOAD-1-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-1-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-1-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-1-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 +; PRELOAD-3-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] !preload_kernel_args !10 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: 
[[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-3-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; PRELOAD-3-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-3-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-3-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-3-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_i16_i16 +; PRELOAD-8-SAME: (i16 inreg [[ARG0:%.*]], i16 inreg [[ARG1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0]] !preload_kernel_args !11 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, 
!invariant.load !3 +; PRELOAD-8-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +; PRELOAD-8-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 16 +; PRELOAD-8-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_I16_I16_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[EXT:%.*]] = zext i16 [[TMP2]] to i32 +; PRELOAD-8-NEXT: [[EXT1:%.*]] = zext i16 [[TMP5]] to i32 +; PRELOAD-8-NEXT: [[ADD:%.*]] = add i32 [[EXT]], [[EXT1]] +; PRELOAD-8-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %ext = zext i16 %arg0 to i32 + %ext1 = zext i16 %arg1 to i32 + %add = add i32 %ext, %ext1 + store i32 %add, ptr addrspace(1) %out + ret void +} + +attributes #0 = { nounwind } +;. +; NO-PRELOAD: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; NO-PRELOAD: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. 
+; PRELOAD-1: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; PRELOAD-1: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; PRELOAD-3: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; PRELOAD-3: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; PRELOAD-8: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; PRELOAD-8: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;. +; NO-PRELOAD: [[META0:![0-9]+]] = !{} +;. 
+; PRELOAD-1: [[META0:![0-9]+]] = !{!1} +; PRELOAD-1: [[META1:![0-9]+]] = !{i32 0, i32 2} +; PRELOAD-1: [[META2:![0-9]+]] = !{} +; PRELOAD-1: [[META3:![0-9]+]] = !{!4} +; PRELOAD-1: [[META4:![0-9]+]] = !{i32 0, i32 1} +;. +; PRELOAD-3: [[META0:![0-9]+]] = !{!1, !2} +; PRELOAD-3: [[META1:![0-9]+]] = !{i32 0, i32 2} +; PRELOAD-3: [[META2:![0-9]+]] = !{i32 1, i32 2} +; PRELOAD-3: [[META3:![0-9]+]] = !{} +; PRELOAD-3: [[META4:![0-9]+]] = !{!1, !2, !5} +; PRELOAD-3: [[META5:![0-9]+]] = !{i32 2, i32 2} +; PRELOAD-3: [[META6:![0-9]+]] = !{!1, !2, !5, !7} +; PRELOAD-3: [[META7:![0-9]+]] = !{i32 3, i32 2} +; PRELOAD-3: [[META8:![0-9]+]] = !{!9, !2, !5} +; PRELOAD-3: [[META9:![0-9]+]] = !{i32 0, i32 1} +; PRELOAD-3: [[META10:![0-9]+]] = !{!9, !11, !5} +; PRELOAD-3: [[META11:![0-9]+]] = !{i32 1, i32 1} +;. +; PRELOAD-8: [[META0:![0-9]+]] = !{!1, !2} +; PRELOAD-8: [[META1:![0-9]+]] = !{i32 0, i32 2} +; PRELOAD-8: [[META2:![0-9]+]] = !{i32 1, i32 2} +; PRELOAD-8: [[META3:![0-9]+]] = !{} +; PRELOAD-8: [[META4:![0-9]+]] = !{!1, !2, !5, !6} +; PRELOAD-8: [[META5:![0-9]+]] = !{i32 2, i32 2} +; PRELOAD-8: [[META6:![0-9]+]] = !{i32 3, i32 2} +; PRELOAD-8: [[META7:![0-9]+]] = !{!1, !2, !5, !6, !8} +; PRELOAD-8: [[META8:![0-9]+]] = !{i32 4, i32 2} +; PRELOAD-8: [[META9:![0-9]+]] = !{!10, !2, !5, !6} +; PRELOAD-8: [[META10:![0-9]+]] = !{i32 0, i32 1} +; PRELOAD-8: [[META11:![0-9]+]] = !{!10, !12, !5} +; PRELOAD-8: [[META12:![0-9]+]] = !{i32 1, i32 1} +;.