diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -13,10 +13,12 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-lower-kernel-arguments" @@ -24,6 +26,86 @@ namespace { +class PreloadKernelArgInfo { +private: + Function &F; + + const GCNSubtarget &ST; + + unsigned NumFreeUserSGPRs; + +public: + SmallVector KernelArgMetadata; + + PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) { + setInitialFreeUserSGPRsCount(); + } + + // Returns the maximum number of user SGPRs that we have available to preload + // arguments. + void setInitialFreeUserSGPRsCount() { + const Module *M = F.getParent(); + + const unsigned MaxUserSGRPs = 16; + unsigned NumUserSGPRs = 0; + + const CallingConv::ID CC = F.getCallingConv(); + const bool IsKernel = + CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; + const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); + + // ImplicitBufferPtr + if (ST.isMesaGfxShader(F)) + NumUserSGPRs += 2; + + // PrivateSegmentBuffer + if (ST.isAmdHsaOrMesa(F) && !ST.enableFlatScratch()) + NumUserSGPRs += 4; + + // DispatchPtr + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) + NumUserSGPRs += 2; + + // QueuePtr + if (!F.hasFnAttribute("amdgpu-no-queue-ptr") && + AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + NumUserSGPRs += 2; + } + + // KernargSegmentPtr + if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) + NumUserSGPRs += 2; + + // DispatchID + if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) + NumUserSGPRs += 2; + + // FlatScratchInit + if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && + (ST.isAmdHsaOrMesa(F) || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + NumUserSGPRs += 2; + } + + // LDSKernelId + if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id")) + NumUserSGPRs += 1; + + NumFreeUserSGPRs = MaxUserSGRPs - NumUserSGPRs; + } + + bool allocPreloadArg(unsigned AllocSize) { + unsigned AllocSizeDword = alignTo(AllocSize, 4) / 4; + if (AllocSizeDword > NumFreeUserSGPRs) + return false; + + NumFreeUserSGPRs -= AllocSizeDword; + return true; + } +}; + class AMDGPULowerKernelArguments : public FunctionPass{ public: static char ID; @@ -87,8 +169,14 @@ Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); uint64_t ExplicitArgOffset = 0; + // Preloaded kernel arguments must be sequential. + bool InPreloadSequence = false; + bool HasPreloadArgs = false; + PreloadKernelArgInfo PreloadInfo(F, ST); + int Idx = -1; for (Argument &Arg : F.args()) { + Idx++; const bool IsByRef = Arg.hasByRefAttr(); Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType(); MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt; @@ -103,6 +191,19 @@ if (Arg.use_empty()) continue; + if (Arg.hasInRegAttr() && (InPreloadSequence || !HasPreloadArgs) && + PreloadInfo.allocPreloadArg(AllocSize)) { + // Preload this argument. + InPreloadSequence = true; + HasPreloadArgs = true; + MDBuilder MDB(Ctx); + PreloadInfo.KernelArgMetadata.push_back( + llvm::MDNode::get(Ctx, MDB.createConstant(llvm::ConstantInt::get( + Builder.getInt32Ty(), Idx)))); + } else if (InPreloadSequence) { + InPreloadSequence = false; + } + // If this is byval, the loads are already explicit in the function. We just // need to rewrite the pointer values. if (IsByRef) { @@ -226,6 +327,11 @@ } } + if (HasPreloadArgs) { + F.setMetadata("preload_kernel_args", + llvm::MDNode::get(Ctx, PreloadInfo.KernelArgMetadata)); + } + KernArgSegment->addRetAttr( Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign))); diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll b/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/preload-kernal-args-metadata.ll @@ -0,0 +1,451 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -S < %s | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=1 -S < %s | FileCheck -check-prefix=PRELOAD-1 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=3 -S < %s | FileCheck -check-prefix=PRELOAD-3 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -amdgpu-lower-kernel-arguments -amdgpu-kernarg-preload-count=8 -S < %s | FileCheck -check-prefix=PRELOAD-8 %s + +; Metadata for each runline is at the bottom of the file. + +define amdgpu_kernel void @test_preload_metadata_kernel_2(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_2 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[OUT:%.*]]) #[[ATTR0:[0-9]+]] !preload_kernel_args !0 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_2_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + store i32 %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_preload_metadata_kernel_4(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %out, ptr addrspace(1) %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_kernel void @test_preload_metadata_kernel_8(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3, ptr addrspace(1) %out, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; NO-PRELOAD-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; NO-PRELOAD-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-1-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-1-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) [[IN3:%.*]], ptr addrspace(1) [[OUT:%.*]], ptr addrspace(1) [[OUT1:%.*]], ptr addrspace(1) [[OUT2:%.*]], ptr addrspace(1) [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-3-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-3-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_8 +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[IN2:%.*]], ptr addrspace(1) inreg [[IN3:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]], ptr addrspace(1) inreg [[OUT2:%.*]], ptr addrspace(1) inreg [[OUT3:%.*]]) #[[ATTR0]] !preload_kernel_args !7 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[IN2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[IN3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 32 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 40 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT2_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 48 +; PRELOAD-8-NEXT: [[OUT2_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT2_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT3_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_8_KERNARG_SEGMENT]], i64 56 +; PRELOAD-8-NEXT: [[OUT3_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT3_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD2:%.*]] = load i32, ptr addrspace(1) [[IN2_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD3:%.*]] = load i32, ptr addrspace(1) [[IN3_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD2]], ptr addrspace(1) [[OUT2_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD3]], ptr addrspace(1) [[OUT3_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + %load2 = load i32, ptr addrspace(1) %in2 + %load3 = load i32, ptr addrspace(1) %in3 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + store i32 %load2, ptr addrspace(1) %out2 + store i32 %load3, ptr addrspace(1) %out3 + ret void +} + +; Preload args with inreg in the NO-PRELOAD case. + +define amdgpu_kernel void @test_preload_metadata_kernel_4_inreg_offset(ptr addrspace(1) %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; NO-PRELOAD-SAME: (ptr addrspace(1) [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !1 { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +; Only preload the first sequence of arguments with the inreg attribute. In the NO-PRELOAD case this is just the first argument. + +define amdgpu_kernel void @test_preload_metadata_kernel_4_inreg_offset_two_sequence(ptr addrspace(1) inreg %in, ptr addrspace(1) %in1, ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %out1) #0 { +; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; NO-PRELOAD-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; NO-PRELOAD-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; NO-PRELOAD-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; NO-PRELOAD-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; NO-PRELOAD-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; NO-PRELOAD-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !0 +; NO-PRELOAD-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; NO-PRELOAD-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !0 +; NO-PRELOAD-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; NO-PRELOAD-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; NO-PRELOAD-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; NO-PRELOAD-NEXT: ret void +; +; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-1-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !0 { +; PRELOAD-1-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-1-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-1-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-1-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-1-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !2 +; PRELOAD-1-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-1-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !2 +; PRELOAD-1-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-1-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-1-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-1-NEXT: ret void +; +; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-3-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !6 { +; PRELOAD-3-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-3-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-3-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-3-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-3-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-3-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-3-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-3-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-3-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-3-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-3-NEXT: ret void +; +; PRELOAD-8-LABEL: define {{[^@]+}}@test_preload_metadata_kernel_4_inreg_offset_two_sequence +; PRELOAD-8-SAME: (ptr addrspace(1) inreg [[IN:%.*]], ptr addrspace(1) inreg [[IN1:%.*]], ptr addrspace(1) inreg [[OUT:%.*]], ptr addrspace(1) inreg [[OUT1:%.*]]) #[[ATTR0]] !preload_kernel_args !4 { +; PRELOAD-8-NEXT: [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() +; PRELOAD-8-NEXT: [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 0 +; PRELOAD-8-NEXT: [[IN_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[IN1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 8 +; PRELOAD-8-NEXT: [[IN1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[IN1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 16 +; PRELOAD-8-NEXT: [[OUT_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT_KERNARG_OFFSET]], align 16, !invariant.load !3 +; PRELOAD-8-NEXT: [[OUT1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TEST_PRELOAD_METADATA_KERNEL_4_INREG_OFFSET_TWO_SEQUENCE_KERNARG_SEGMENT]], i64 24 +; PRELOAD-8-NEXT: [[OUT1_LOAD:%.*]] = load ptr addrspace(1), ptr addrspace(4) [[OUT1_KERNARG_OFFSET]], align 8, !invariant.load !3 +; PRELOAD-8-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[IN_LOAD]], align 4 +; PRELOAD-8-NEXT: [[LOAD1:%.*]] = load i32, ptr addrspace(1) [[IN1_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD]], ptr addrspace(1) [[OUT_LOAD]], align 4 +; PRELOAD-8-NEXT: store i32 [[LOAD1]], ptr addrspace(1) [[OUT1_LOAD]], align 4 +; PRELOAD-8-NEXT: ret void +; + %load = load i32, ptr addrspace(1) %in + %load1 = load i32, ptr addrspace(1) %in1 + store i32 %load, ptr addrspace(1) %out + store i32 %load1, ptr addrspace(1) %out1 + ret void +} + +attributes #0 = { nounwind } + +; Metadata + +; NO-PRELOAD: !0 = !{} +; NO-PRELOAD-NEXT: !1 = !{!2, !3} +; NO-PRELOAD-NEXT: !2 = !{i32 2} +; NO-PRELOAD-NEXT: !3 = !{i32 3} +; NO-PRELOAD-NEXT: !4 = !{!5} +; NO-PRELOAD-NEXT: !5 = !{i32 0} + +; PRELOAD-1: 0 = !{!1} +; PRELOAD-1-NEXT: !1 = !{i32 0} +; PRELOAD-1-NEXT: !2 = !{} + +; PRELOAD-3: !0 = !{!1, !2} +; PRELOAD-3-NEXT: !1 = !{i32 0} +; PRELOAD-3-NEXT: !2 = !{i32 1} +; PRELOAD-3-NEXT: !3 = !{} +; PRELOAD-3-NEXT: !4 = !{!1, !2, !5} +; PRELOAD-3-NEXT: !5 = !{i32 2} +; PRELOAD-3-NEXT: !6 = !{!1, !2, !5, !7} +; PRELOAD-3-NEXT: !7 = !{i32 3} + +; PRELOAD-8: !0 = !{!1, !2} +; PREALOD-8-NEXT: !1 = !{i32 0} +; PREALOD-8-NEXT: !2 = !{i32 1} +; PREALOD-8-NEXT: !3 = !{!1, !2, !4, !5} +; PREALOD-8-NEXT: !4 = !{i32 2} +; PREALOD-8-NEXT: !5 = !{i32 3} +; PREALOD-8-NEXT: !6 = !{!1, !2, !4, !5, !7} +; PREALOD-8-NEXT: !7 = !{i32 4} +; PREALOD-8-NEXT: !8 = !{}